Add support for --seq_limit in Jackhmmer - significantly reduces peak RAM use

* See https://github.com/EddyRivasLab/hmmer/issues/323 for more context.
* We observed 75.6 GB -> 13.6 GB peak RAM reduction for one of our queries.

PiperOrigin-RevId: 782808687
Change-Id: I4306dc6921015c88c5f8ced69a4ef46e10574a57
This commit is contained in:
Augustin Zidek
2025-07-14 01:59:30 -07:00
committed by Copybara-Service
parent a209008f6f
commit 751a4b8612
4 changed files with 65 additions and 5 deletions

View File

@ -31,14 +31,23 @@ RUN pip3 install --no-cache-dir --upgrade pip
# Install HMMER. Do so before copying the source code, so that docker can cache
# the image layer containing HMMER. Alternatively, you could also install it
# using `apt-get install hmmer` instead of building it from source, but we want
# to have control over the exact version of HMMER. Also note that eddylab.org
# unfortunately doesn't support HTTPS and the tar file published on GitHub is
# explicitly not recommended to be used for building from source.
# to have control over the exact version of HMMER and also apply the sequence
# limit patch. Also note that eddylab.org unfortunately doesn't support HTTPS
# and the tar file published on GitHub is explicitly not recommended to be used
# for building from source.
# Download, check hash, and extract the HMMER source code.
RUN mkdir /hmmer_build /hmmer ; \
wget http://eddylab.org/software/hmmer/hmmer-3.4.tar.gz --directory-prefix /hmmer_build ; \
(cd /hmmer_build && echo "ca70d94fd0cf271bd7063423aabb116d42de533117343a9b27a65c17ff06fbf3 hmmer-3.4.tar.gz" | sha256sum --check) && \
(cd /hmmer_build && tar zxf hmmer-3.4.tar.gz && rm hmmer-3.4.tar.gz) ; \
(cd /hmmer_build/hmmer-3.4 && ./configure --prefix /hmmer) ; \
(cd /hmmer_build && tar zxf hmmer-3.4.tar.gz && rm hmmer-3.4.tar.gz)
# Apply the --seq_limit patch to HMMER.
COPY docker/jackhmmer_seq_limit.patch /hmmer_build/
RUN (cd /hmmer_build && patch -p0 < jackhmmer_seq_limit.patch)
# Build HMMER.
RUN (cd /hmmer_build/hmmer-3.4 && ./configure --prefix /hmmer) ; \
(cd /hmmer_build/hmmer-3.4 && make -j) ; \
(cd /hmmer_build/hmmer-3.4 && make install) ; \
(cd /hmmer_build/hmmer-3.4/easel && make install) ; \

View File

@ -0,0 +1,32 @@
--- hmmer-3.4/src/jackhmmer.c
+++ hmmer-3.4/src/jackhmmer.c
@@ -73,6 +73,7 @@ static ESL_OPTIONS options[] = {
{ "--noali", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "don't output alignments, so output is smaller", 2 },
{ "--notextw", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "--textw", "unlimit ASCII text output line width", 2 },
{ "--textw", eslARG_INT, "120", NULL, "n>=120", NULL, NULL, "--notextw", "set max width of ASCII text output lines", 2 },
+ { "--seq_limit", eslARG_INT, NULL, NULL, NULL, NULL, NULL, "--seq_limit", "if set, truncate all hits after this value is reached", 2 },
/* Control of scoring system */
{ "--popen", eslARG_REAL, "0.02", NULL, "0<=x<0.5",NULL, NULL, NULL, "gap open probability", 3 },
{ "--pextend", eslARG_REAL, "0.4", NULL, "0<=x<1", NULL, NULL, NULL, "gap extend probability", 3 },
@@ -298,6 +299,7 @@ output_header(FILE *ofp, ESL_GETOPTS *go
if (esl_opt_IsUsed(go, "--noali") && fprintf(ofp, "# show alignments in output: no\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
if (esl_opt_IsUsed(go, "--notextw") && fprintf(ofp, "# max ASCII text line length: unlimited\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
if (esl_opt_IsUsed(go, "--textw") && fprintf(ofp, "# max ASCII text line length: %d\n", esl_opt_GetInteger(go, "--textw")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
+ if (esl_opt_IsUsed(go, "--seq_limit") && fprintf(ofp, "# set max sequence hits to return: %d\n", esl_opt_GetInteger(go, "--seq_limit")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
if (esl_opt_IsUsed(go, "--popen") && fprintf(ofp, "# gap open probability: %f\n", esl_opt_GetReal (go, "--popen")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
if (esl_opt_IsUsed(go, "--pextend") && fprintf(ofp, "# gap extend probability: %f\n", esl_opt_GetReal (go, "--pextend")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
if (esl_opt_IsUsed(go, "--mx") && fprintf(ofp, "# subst score matrix (built-in): %s\n", esl_opt_GetString (go, "--mx")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
@@ -674,6 +676,13 @@ serial_master(ESL_GETOPTS *go, struct cf
/* Print the results. */
p7_tophits_SortBySortkey(info->th);
p7_tophits_Threshold(info->th, info->pli);
+ /* Limit the number of hits if specified. */
+ if (esl_opt_IsOn(go, "--seq_limit"))
+ {
+ int seq_limit = esl_opt_GetInteger(go, "--seq_limit");
+ info->th->N = ESL_MIN(info->th->N, seq_limit);
+ }
+
p7_tophits_CompareRanking(info->th, kh, &nnew_targets);
p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");

View File

@ -73,6 +73,13 @@ class Jackhmmer(msa_tool.MsaTool):
self.filter_f2 = filter_f2
self.filter_f3 = filter_f3
# If Jackhmmer supports the --seq_limit flag (via our patch), use it to
# prevent writing out redundant sequences and increasing peak memory usage.
# If not, Jackhmmer will be run without the --seq_limit flag.
self.supports_seq_limit = subprocess_utils.jackhmmer_seq_limit_supported(
self.binary_path
)
def query(self, target_sequence: str) -> msa_tool.MsaToolResult:
"""Queries the database using Jackhmmer."""
logging.info(
@ -116,6 +123,9 @@ class Jackhmmer(msa_tool.MsaTool):
if self.z_value is not None:
cmd_flags.extend(['-Z', str(self.z_value)])
if self.max_sequences is not None and self.supports_seq_limit:
cmd_flags.extend(['--seq_limit', str(self.max_sequences)])
cmd = (
[self.binary_path]
+ cmd_flags

View File

@ -36,6 +36,15 @@ def check_binary_exists(path: str, name: str) -> None:
raise RuntimeError(f'{name} binary not found at {path}')
def jackhmmer_seq_limit_supported(jackhmmer_path: str) -> bool:
  """Checks if Jackhmmer supports the --seq_limit flag.

  The binary is probed by invoking it with `-h --seq_limit 1`; the exit status
  of that invocation decides the result.

  Args:
    jackhmmer_path: Path to the Jackhmmer binary to probe.

  Returns:
    True if the probe command exits with status 0, False if it exits with a
    non-zero status.

  Raises:
    OSError: If the binary cannot be executed at all (e.g. it does not exist);
      only a non-zero exit status is translated into a False result.
  """
  try:
    # Discard whatever the probe prints (help or error text) so that it does
    # not leak into the caller's stdout/stderr; only the exit status matters.
    subprocess.run(
        [jackhmmer_path, '-h', '--seq_limit', '1'],
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
  except subprocess.CalledProcessError:
    return False
  return True
def run(
cmd: Sequence[str],
cmd_name: str,