Files
biopython/Bio/Data/IUPACData.py
Peter Cock 57056bbfc4 ruff format v0.3.1, black v24.2.0
Reverted unwanted changes (mostly arrays in tests, but also
some whitespace in doctests).

Remaining changes are standardising spacing between module
docstring and imports, and lower-case \x<hex> in strings.
2024-03-14 11:05:43 +00:00

421 lines
9.0 KiB
Python

# Copyright 2000 Andrew Dalke. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Information about the IUPAC alphabets."""
protein_letters = "ACDEFGHIKLMNPQRSTVWY"
extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
# B = "Asx"; aspartic acid or asparagine (D or N)
# X = "Xxx"; unknown or 'other' amino acid
# Z = "Glx"; glutamic acid or glutamine (E or Q)
# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
#
# J = "Xle"; leucine or isoleucine (L or I, used in NMR)
# Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
# Also the International Nucleotide Sequence Database Collaboration (INSDC)
# (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
# http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
#
# Xle (J); Leucine or Isoleucine
# The residue abbreviations, Xle (the three-letter abbreviation) and J
# (the one-letter abbreviation) are reserved for the case that cannot
# experimentally distinguish leucine from isoleucine.
#
# U = "Sec"; selenocysteine
# http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#
# O = "Pyl"; pyrrolysine
# http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35
protein_letters_1to3 = {
"A": "Ala",
"C": "Cys",
"D": "Asp",
"E": "Glu",
"F": "Phe",
"G": "Gly",
"H": "His",
"I": "Ile",
"K": "Lys",
"L": "Leu",
"M": "Met",
"N": "Asn",
"P": "Pro",
"Q": "Gln",
"R": "Arg",
"S": "Ser",
"T": "Thr",
"V": "Val",
"W": "Trp",
"Y": "Tyr",
}
protein_letters_1to3_extended = {
**protein_letters_1to3,
**{"B": "Asx", "X": "Xaa", "Z": "Glx", "J": "Xle", "U": "Sec", "O": "Pyl"},
}
protein_letters_3to1 = {value: key for key, value in protein_letters_1to3.items()}
protein_letters_3to1_extended = {
value: key for key, value in protein_letters_1to3_extended.items()
}
ambiguous_dna_letters = "GATCRYWSMKHBVDN"
unambiguous_dna_letters = "GATC"
ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
unambiguous_rna_letters = "GAUC"
# B == 5-bromouridine
# D == 5,6-dihydrouridine
# S == thiouridine
# W == wyosine
extended_dna_letters = "GATCBDSW"
# are there extended forms?
# extended_rna_letters = "GAUCBDSW"
# "X" is included in the following _values and _complement dictionaries,
# for historical reasons although it is not an IUPAC nucleotide,
# and so is not in the corresponding _letters strings above
ambiguous_dna_values = {
"A": "A",
"C": "C",
"G": "G",
"T": "T",
"M": "AC",
"R": "AG",
"W": "AT",
"S": "CG",
"Y": "CT",
"K": "GT",
"V": "ACG",
"H": "ACT",
"D": "AGT",
"B": "CGT",
"X": "GATC",
"N": "GATC",
}
ambiguous_rna_values = {
"A": "A",
"C": "C",
"G": "G",
"U": "U",
"M": "AC",
"R": "AG",
"W": "AU",
"S": "CG",
"Y": "CU",
"K": "GU",
"V": "ACG",
"H": "ACU",
"D": "AGU",
"B": "CGU",
"X": "GAUC",
"N": "GAUC",
}
ambiguous_dna_complement = {
"A": "T",
"C": "G",
"G": "C",
"T": "A",
"M": "K",
"R": "Y",
"W": "W",
"S": "S",
"Y": "R",
"K": "M",
"V": "B",
"H": "D",
"D": "H",
"B": "V",
"X": "X",
"N": "N",
}
ambiguous_rna_complement = {
"A": "U",
"C": "G",
"G": "C",
"U": "A",
"M": "K",
"R": "Y",
"W": "W",
"S": "S",
"Y": "R",
"K": "M",
"V": "B",
"H": "D",
"D": "H",
"B": "V",
"X": "X",
"N": "N",
}
def _make_ranges(mydict):
d = {}
for key, value in mydict.items():
d[key] = (value, value)
return d
# Mass data taken from PubChem
# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = {"A": 331.2218, "C": 307.1971, "G": 347.2212, "T": 322.2085}
# Monoisotopic masses of monophospate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = {
"A": 331.06817,
"C": 307.056936,
"G": 347.063084,
"T": 322.056602,
}
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)
unambiguous_rna_weights = {"A": 347.2212, "C": 323.1965, "G": 363.2206, "U": 324.1813}
monoisotopic_unambiguous_rna_weights = {
"A": 347.063084,
"C": 323.051851,
"G": 363.057999,
"U": 324.035867,
}
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
def _make_ambiguous_ranges(mydict, weight_table):
range_d = {}
avg_d = {}
for letter, values in mydict.items():
# Following line is a quick hack to skip undefined weights for U and O
if len(values) == 1 and values[0] not in weight_table:
continue
weights = [weight_table.get(x) for x in values]
range_d[letter] = (min(weights), max(weights))
total_w = 0.0
for w in weights:
total_w = total_w + w
avg_d[letter] = total_w / len(weights)
return range_d, avg_d
ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges(
ambiguous_dna_values, unambiguous_dna_weights
)
ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges(
ambiguous_rna_values, unambiguous_rna_weights
)
protein_weights = {
"A": 89.0932,
"C": 121.1582,
"D": 133.1027,
"E": 147.1293,
"F": 165.1891,
"G": 75.0666,
"H": 155.1546,
"I": 131.1729,
"K": 146.1876,
"L": 131.1729,
"M": 149.2113,
"N": 132.1179,
"O": 255.3134,
"P": 115.1305,
"Q": 146.1445,
"R": 174.201,
"S": 105.0926,
"T": 119.1192,
"U": 168.0532,
"V": 117.1463,
"W": 204.2252,
"Y": 181.1885,
}
monoisotopic_protein_weights = {
"A": 89.047678,
"C": 121.019749,
"D": 133.037508,
"E": 147.053158,
"F": 165.078979,
"G": 75.032028,
"H": 155.069477,
"I": 131.094629,
"K": 146.105528,
"L": 131.094629,
"M": 149.051049,
"N": 132.053492,
"O": 255.158292,
"P": 115.063329,
"Q": 146.069142,
"R": 174.111676,
"S": 105.042593,
"T": 119.058243,
"U": 168.964203,
"V": 117.078979,
"W": 204.089878,
"Y": 181.073893,
}
extended_protein_values = {
"A": "A",
"B": "ND",
"C": "C",
"D": "D",
"E": "E",
"F": "F",
"G": "G",
"H": "H",
"I": "I",
"J": "IL",
"K": "K",
"L": "L",
"M": "M",
"N": "N",
"O": "O",
"P": "P",
"Q": "Q",
"R": "R",
"S": "S",
"T": "T",
"U": "U",
"V": "V",
"W": "W",
"X": "ACDEFGHIKLMNPQRSTVWY",
# TODO - Include U and O in the possible values of X?
# This could alter the extended_protein_weight_ranges ...
# by MP: Won't do this, because they are so rare.
"Y": "Y",
"Z": "QE",
}
protein_weight_ranges = _make_ranges(protein_weights)
extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges(
extended_protein_values, protein_weights
)
# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
atom_weights = {
"H": 1.00794,
"D": 2.01410,
"He": 4.002602,
"Li": 6.941,
"Be": 9.012182,
"B": 10.811,
"C": 12.0107,
"N": 14.0067,
"O": 15.9994,
"F": 18.9984032,
"Ne": 20.1797,
"Na": 22.989770,
"Mg": 24.3050,
"Al": 26.981538,
"Si": 28.0855,
"P": 30.973761,
"S": 32.065,
"Cl": 35.453,
"Ar": 39.948,
"K": 39.0983,
"Ca": 40.078,
"Sc": 44.955910,
"Ti": 47.867,
"V": 50.9415,
"Cr": 51.9961,
"Mn": 54.938049,
"Fe": 55.845,
"Co": 58.933200,
"Ni": 58.6934,
"Cu": 63.546,
"Zn": 65.39,
"Ga": 69.723,
"Ge": 72.64,
"As": 74.92160,
"Se": 78.96,
"Br": 79.904,
"Kr": 83.80,
"Rb": 85.4678,
"Sr": 87.62,
"Y": 88.90585,
"Zr": 91.224,
"Nb": 92.90638,
"Mo": 95.94,
"Tc": 98.0,
"Ru": 101.07,
"Rh": 102.90550,
"Pd": 106.42,
"Ag": 107.8682,
"Cd": 112.411,
"In": 114.818,
"Sn": 118.710,
"Sb": 121.760,
"Te": 127.60,
"I": 126.90447,
"Xe": 131.293,
"Cs": 132.90545,
"Ba": 137.327,
"La": 138.9055,
"Ce": 140.116,
"Pr": 140.90765,
"Nd": 144.24,
"Pm": 145.0,
"Sm": 150.36,
"Eu": 151.964,
"Gd": 157.25,
"Tb": 158.92534,
"Dy": 162.50,
"Ho": 164.93032,
"Er": 167.259,
"Tm": 168.93421,
"Yb": 173.04,
"Lu": 174.967,
"Hf": 178.49,
"Ta": 180.9479,
"W": 183.84,
"Re": 186.207,
"Os": 190.23,
"Ir": 192.217,
"Pt": 195.078,
"Au": 196.96655,
"Hg": 200.59,
"Tl": 204.3833,
"Pb": 207.2,
"Bi": 208.98038,
"Po": 208.98,
"At": 209.99,
"Rn": 222.02,
"Fr": 223.02,
"Ra": 226.03,
"Ac": 227.03,
"Th": 232.0381,
"Pa": 231.03588,
"U": 238.02891,
"Np": 237.05,
"Pu": 244.06,
"Am": 243.06,
"Cm": 247.07,
"Bk": 247.07,
"Cf": 251.08,
"Es": 252.08,
"Fm": 257.10,
"Md": 258.10,
"No": 259.10,
"Lr": 262.11,
"Rf": 261.11,
"Db": 262.11,
"Sg": 266.12,
"Bh": 264.12,
"Hs": 269.13,
"Mt": 268.14,
}