mirror of
https://github.com/google-deepmind/alphafold3.git
synced 2025-10-20 13:23:47 +08:00
Add support for --seq_limit in Jackhmmer - significantly reduces peak RAM use
* See https://github.com/EddyRivasLab/hmmer/issues/323 for more context. * We observed 75.6 GB -> 13.6 GB peak RAM reduction for one of our queries. PiperOrigin-RevId: 782808687 Change-Id: I4306dc6921015c88c5f8ced69a4ef46e10574a57
This commit is contained in:
committed by
Copybara-Service
parent
a209008f6f
commit
751a4b8612
@ -31,14 +31,23 @@ RUN pip3 install --no-cache-dir --upgrade pip
|
||||
# Install HMMER. Do so before copying the source code, so that docker can cache
|
||||
# the image layer containing HMMER. Alternatively, you could also install it
|
||||
# using `apt-get install hmmer` instead of building it from source, but we want
|
||||
# to have control over the exact version of HMMER. Also note that eddylab.org
|
||||
# unfortunately doesn't support HTTPS and the tar file published on GitHub is
|
||||
# explicitly not recommended to be used for building from source.
|
||||
# to have control over the exact version of HMMER and also apply the sequence
|
||||
# limit patch. Also note that eddylab.org unfortunately doesn't support HTTPS
|
||||
# and the tar file published on GitHub is explicitly not recommended to be used
|
||||
# for building from source.
|
||||
|
||||
# Download, check hash, and extract the HMMER source code.
|
||||
RUN mkdir /hmmer_build /hmmer ; \
|
||||
wget http://eddylab.org/software/hmmer/hmmer-3.4.tar.gz --directory-prefix /hmmer_build ; \
|
||||
(cd /hmmer_build && echo "ca70d94fd0cf271bd7063423aabb116d42de533117343a9b27a65c17ff06fbf3 hmmer-3.4.tar.gz" | sha256sum --check) && \
|
||||
(cd /hmmer_build && tar zxf hmmer-3.4.tar.gz && rm hmmer-3.4.tar.gz) ; \
|
||||
(cd /hmmer_build/hmmer-3.4 && ./configure --prefix /hmmer) ; \
|
||||
(cd /hmmer_build && tar zxf hmmer-3.4.tar.gz && rm hmmer-3.4.tar.gz)
|
||||
|
||||
# Apply the --seq_limit patch to HMMER.
|
||||
COPY docker/jackhmmer_seq_limit.patch /hmmer_build/
|
||||
RUN (cd /hmmer_build && patch -p0 < jackhmmer_seq_limit.patch)
|
||||
|
||||
# Build HMMER.
|
||||
RUN (cd /hmmer_build/hmmer-3.4 && ./configure --prefix /hmmer) ; \
|
||||
(cd /hmmer_build/hmmer-3.4 && make -j) ; \
|
||||
(cd /hmmer_build/hmmer-3.4 && make install) ; \
|
||||
(cd /hmmer_build/hmmer-3.4/easel && make install) ; \
|
||||
|
32
docker/jackhmmer_seq_limit.patch
Normal file
32
docker/jackhmmer_seq_limit.patch
Normal file
@ -0,0 +1,32 @@
|
||||
--- hmmer-3.4/src/jackhmmer.c
|
||||
+++ hmmer-3.4/src/jackhmmer.c
|
||||
@@ -73,6 +73,7 @@ static ESL_OPTIONS options[] = {
|
||||
{ "--noali", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "don't output alignments, so output is smaller", 2 },
|
||||
{ "--notextw", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, "--textw", "unlimit ASCII text output line width", 2 },
|
||||
{ "--textw", eslARG_INT, "120", NULL, "n>=120", NULL, NULL, "--notextw", "set max width of ASCII text output lines", 2 },
|
||||
+ { "--seq_limit", eslARG_INT, NULL, NULL, NULL, NULL, NULL, "--seq_limit", "if set, truncate all hits after this value is reached", 2 },
|
||||
/* Control of scoring system */
|
||||
{ "--popen", eslARG_REAL, "0.02", NULL, "0<=x<0.5",NULL, NULL, NULL, "gap open probability", 3 },
|
||||
{ "--pextend", eslARG_REAL, "0.4", NULL, "0<=x<1", NULL, NULL, NULL, "gap extend probability", 3 },
|
||||
@@ -298,6 +299,7 @@ output_header(FILE *ofp, ESL_GETOPTS *go
|
||||
if (esl_opt_IsUsed(go, "--noali") && fprintf(ofp, "# show alignments in output: no\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
||||
if (esl_opt_IsUsed(go, "--notextw") && fprintf(ofp, "# max ASCII text line length: unlimited\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
||||
if (esl_opt_IsUsed(go, "--textw") && fprintf(ofp, "# max ASCII text line length: %d\n", esl_opt_GetInteger(go, "--textw")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
||||
+ if (esl_opt_IsUsed(go, "--seq_limit") && fprintf(ofp, "# set max sequence hits to return: %d\n", esl_opt_GetInteger(go, "--seq_limit")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
||||
if (esl_opt_IsUsed(go, "--popen") && fprintf(ofp, "# gap open probability: %f\n", esl_opt_GetReal (go, "--popen")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
||||
if (esl_opt_IsUsed(go, "--pextend") && fprintf(ofp, "# gap extend probability: %f\n", esl_opt_GetReal (go, "--pextend")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
||||
if (esl_opt_IsUsed(go, "--mx") && fprintf(ofp, "# subst score matrix (built-in): %s\n", esl_opt_GetString (go, "--mx")) < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
||||
@@ -674,6 +676,13 @@ serial_master(ESL_GETOPTS *go, struct cf
|
||||
/* Print the results. */
|
||||
p7_tophits_SortBySortkey(info->th);
|
||||
p7_tophits_Threshold(info->th, info->pli);
|
||||
+ /* Limit the number of hits if specified. */
|
||||
+ if (esl_opt_IsOn(go, "--seq_limit"))
|
||||
+ {
|
||||
+ int seq_limit = esl_opt_GetInteger(go, "--seq_limit");
|
||||
+ info->th->N = ESL_MIN(info->th->N, seq_limit);
|
||||
+ }
|
||||
+
|
||||
p7_tophits_CompareRanking(info->th, kh, &nnew_targets);
|
||||
p7_tophits_Targets(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
||||
p7_tophits_Domains(ofp, info->th, info->pli, textw); if (fprintf(ofp, "\n\n") < 0) ESL_EXCEPTION_SYS(eslEWRITE, "write failed");
|
@ -73,6 +73,13 @@ class Jackhmmer(msa_tool.MsaTool):
|
||||
self.filter_f2 = filter_f2
|
||||
self.filter_f3 = filter_f3
|
||||
|
||||
# If Jackhmmer supports the --seq_limit flag (via our patch), use it to
|
||||
# prevent writing out redundant sequences and increasing peak memory usage.
|
||||
# If not, Jackhmmer will be run without the --seq_limit flag.
|
||||
self.supports_seq_limit = subprocess_utils.jackhmmer_seq_limit_supported(
|
||||
self.binary_path
|
||||
)
|
||||
|
||||
def query(self, target_sequence: str) -> msa_tool.MsaToolResult:
|
||||
"""Queries the database using Jackhmmer."""
|
||||
logging.info(
|
||||
@ -116,6 +123,9 @@ class Jackhmmer(msa_tool.MsaTool):
|
||||
if self.z_value is not None:
|
||||
cmd_flags.extend(['-Z', str(self.z_value)])
|
||||
|
||||
if self.max_sequences is not None and self.supports_seq_limit:
|
||||
cmd_flags.extend(['--seq_limit', str(self.max_sequences)])
|
||||
|
||||
cmd = (
|
||||
[self.binary_path]
|
||||
+ cmd_flags
|
||||
|
@ -36,6 +36,15 @@ def check_binary_exists(path: str, name: str) -> None:
|
||||
raise RuntimeError(f'{name} binary not found at {path}')
|
||||
|
||||
|
||||
def jackhmmer_seq_limit_supported(jackhmmer_path: str) -> bool:
  """Checks if Jackhmmer supports the --seq_limit flag.

  Probes the binary by running `<jackhmmer_path> -h --seq_limit 1` and looking
  only at the exit status. Presumably a binary without the --seq_limit patch
  rejects the unknown flag with a non-zero exit code, while a patched one
  prints its help and exits successfully.

  Args:
    jackhmmer_path: Path to (or name of) the Jackhmmer binary to probe.

  Returns:
    True if the probe command exits with status 0. False if it exits with a
    non-zero status or if the binary could not be launched at all.
  """
  try:
    subprocess.run(
        [jackhmmer_path, '-h', '--seq_limit', '1'],
        check=True,
        # Only the exit status matters. Without these the help text (or the
        # "unknown option" error) of the probe leaks into the caller's output.
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
    )
  except (subprocess.CalledProcessError, OSError):
    # CalledProcessError: the binary ran but rejected the command line.
    # OSError: the binary is missing or not executable; a probe should answer
    # "not supported" rather than raise.
    return False
  return True
|
||||
|
||||
|
||||
def run(
|
||||
cmd: Sequence[str],
|
||||
cmd_name: str,
|
||||
|
Reference in New Issue
Block a user