From 751a4b8612d0d53de8f6e1830c8f726e873a55cf Mon Sep 17 00:00:00 2001 From: Augustin Zidek Date: Mon, 14 Jul 2025 01:59:30 -0700 Subject: [PATCH] Add support for --seq_limit in Jackhmmer - significantly reduces peak RAM use * See https://github.com/EddyRivasLab/hmmer/issues/323 for more context. * We observed 75.6 GB -> 13.6 GB peak RAM reduction for one of our queries. PiperOrigin-RevId: 782808687 Change-Id: I4306dc6921015c88c5f8ced69a4ef46e10574a57 --- docker/Dockerfile | 19 ++++++++--- docker/jackhmmer_seq_limit.patch | 32 +++++++++++++++++++ src/alphafold3/data/tools/jackhmmer.py | 10 ++++++ src/alphafold3/data/tools/subprocess_utils.py | 9 ++++++ 4 files changed, 65 insertions(+), 5 deletions(-) create mode 100644 docker/jackhmmer_seq_limit.patch diff --git a/docker/Dockerfile b/docker/Dockerfile index 8231187..cf3eb68 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -31,14 +31,23 @@ RUN pip3 install --no-cache-dir --upgrade pip # Install HMMER. Do so before copying the source code, so that docker can cache # the image layer containing HMMER. Alternatively, you could also install it # using `apt-get install hmmer` instead of bulding it from source, but we want -# to have control over the exact version of HMMER. Also note that eddylab.org -# unfortunately doesn't support HTTPS and the tar file published on GitHub is -# explicitly not recommended to be used for building from source. +# to have control over the exact version of HMMER and also apply the sequence +# limit patch. Also note that eddylab.org unfortunately doesn't support HTTPS +# and the tar file published on GitHub is explicitly not recommended to be used +# for building from source. + +# Download, check hash, and extract the HMMER source code. 
RUN mkdir /hmmer_build /hmmer ; \ wget http://eddylab.org/software/hmmer/hmmer-3.4.tar.gz --directory-prefix /hmmer_build ; \ (cd /hmmer_build && echo "ca70d94fd0cf271bd7063423aabb116d42de533117343a9b27a65c17ff06fbf3 hmmer-3.4.tar.gz" | sha256sum --check) && \ - (cd /hmmer_build && tar zxf hmmer-3.4.tar.gz && rm hmmer-3.4.tar.gz) ; \ - (cd /hmmer_build/hmmer-3.4 && ./configure --prefix /hmmer) ; \ + (cd /hmmer_build && tar zxf hmmer-3.4.tar.gz && rm hmmer-3.4.tar.gz) + +# Apply the --seq_limit patch to HMMER. +COPY docker/jackhmmer_seq_limit.patch /hmmer_build/ +RUN (cd /hmmer_build && patch -p0 < jackhmmer_seq_limit.patch) + +# Build HMMER. +RUN (cd /hmmer_build/hmmer-3.4 && ./configure --prefix /hmmer) ; \ (cd /hmmer_build/hmmer-3.4 && make -j) ; \ (cd /hmmer_build/hmmer-3.4 && make install) ; \ (cd /hmmer_build/hmmer-3.4/easel && make install) ; \ diff --git a/docker/jackhmmer_seq_limit.patch b/docker/jackhmmer_seq_limit.patch new file mode 100644 index 0000000..9249a1f --- /dev/null +++ b/docker/jackhmmer_seq_limit.patch @@ -0,0 +1,32 @@ +--- hmmer-3.4/src/jackhmmer.c ++++ hmmer-3.4/src/jackhmmer.c +@@ -73,6 +73,7 @@ static ESL_OPTIONS options[] = { + { "--noali", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "don't output alignments, so output is smaller", 2 }, + { "--notextw", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "--textw", "unlimit ASCII text output line width", 2 }, + { "--textw", eslARG_INT, "120", NULL, "n>=120", NULL, NULL, "--notextw", "set max width of ASCII text output lines", 2 }, ++ { "--seq_limit", eslARG_INT, NULL, NULL, NULL, NULL, NULL, "--seq_limit", "if set, truncate all hits after this value is reached", 2 }, + /* Control of scoring system */ + { "--popen", eslARG_REAL, "0.02", NULL, "0<=x<0.5",NULL, NULL, NULL, "gap open probability", 3 }, + { "--pextend", eslARG_REAL, "0.4", NULL, "0<=x<1", NULL, NULL, NULL, "gap extend probability", 3 }, +@@ -298,6 +299,7 @@ output_header(FILE *ofp, ESL_GETOPTS *go + if (esl_opt_IsUsed(go, 
"--noali") && fprintf(ofp, "# show alignments in output: no\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); + if (esl_opt_IsUsed(go, "--notextw") && fprintf(ofp, "# max ASCII text line length: unlimited\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); + if (esl_opt_IsUsed(go, "--textw") && fprintf(ofp, "# max ASCII text line length: %d\n", esl_opt_GetInteger(go, "--textw")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); ++ if (esl_opt_IsUsed(go, "--seq_limit") && fprintf(ofp, "# set max sequence hits to return: %d\n", esl_opt_GetInteger(go, "--seq_limit")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); + if (esl_opt_IsUsed(go, "--popen") && fprintf(ofp, "# gap open probability: %f\n", esl_opt_GetReal (go, "--popen")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); + if (esl_opt_IsUsed(go, "--pextend") && fprintf(ofp, "# gap extend probability: %f\n", esl_opt_GetReal (go, "--pextend")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); + if (esl_opt_IsUsed(go, "--mx") && fprintf(ofp, "# subst score matrix (built-in): %s\n", esl_opt_GetString (go, "--mx")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); +@@ -674,6 +676,13 @@ serial_master(ESL_GETOPTS *go, struct cf + /* Print the results. */ + p7_tophits_SortBySortkey(info->th); + p7_tophits_Threshold(info->th, info->pli); ++ /* Limit the number of hits if specified. 
*/ ++ if (esl_opt_IsOn(go, "--seq_limit")) ++ { ++ int seq_limit = esl_opt_GetInteger(go, "--seq_limit"); ++ info->th->N = ESL_MIN(info->th->N, seq_limit); ++ } ++ + p7_tophits_CompareRanking(info->th, kh, &nnew_targets); + p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); + p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed"); diff --git a/src/alphafold3/data/tools/jackhmmer.py b/src/alphafold3/data/tools/jackhmmer.py index 7d80417..09f4fdc 100644 --- a/src/alphafold3/data/tools/jackhmmer.py +++ b/src/alphafold3/data/tools/jackhmmer.py @@ -73,6 +73,13 @@ class Jackhmmer(msa_tool.MsaTool): self.filter_f2 = filter_f2 self.filter_f3 = filter_f3 + # If Jackhmmer supports the --seq_limit flag (via our patch), use it to + # prevent writing out redundant sequences and increasing peak memory usage. + # If not, the Jackhmmer will be run without the --seq_limit flag. 
+ self.supports_seq_limit = subprocess_utils.jackhmmer_seq_limit_supported( + self.binary_path + ) + def query(self, target_sequence: str) -> msa_tool.MsaToolResult: """Queries the database using Jackhmmer.""" logging.info( @@ -116,6 +123,9 @@ class Jackhmmer(msa_tool.MsaTool): if self.z_value is not None: cmd_flags.extend(['-Z', str(self.z_value)]) + if self.max_sequences is not None and self.supports_seq_limit: + cmd_flags.extend(['--seq_limit', str(self.max_sequences)]) + cmd = ( [self.binary_path] + cmd_flags diff --git a/src/alphafold3/data/tools/subprocess_utils.py b/src/alphafold3/data/tools/subprocess_utils.py index a736d7a..48387eb 100644 --- a/src/alphafold3/data/tools/subprocess_utils.py +++ b/src/alphafold3/data/tools/subprocess_utils.py @@ -36,6 +36,15 @@ def check_binary_exists(path: str, name: str) -> None: raise RuntimeError(f'{name} binary not found at {path}') +def jackhmmer_seq_limit_supported(jackhmmer_path: str) -> bool: + """Checks if Jackhmmer supports the --seq_limit flag. + + Probes with `-h --seq_limit 1`, discarding output; exit status 0 means yes. + """ + cmd = [jackhmmer_path, '-h', '--seq_limit', '1'] + return subprocess.run(cmd, check=False, capture_output=True).returncode == 0 + + def run( cmd: Sequence[str], cmd_name: str,