biopython/Bio/Data/IUPACData.py

# Copyright 2000 Andrew Dalke.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Information about the IUPAC alphabets."""

protein_letters = "ACDEFGHIKLMNPQRSTVWY"
extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
#   B = "Asx";  aspartic acid or asparagine (D or N)
#   X = "Xxx";  unknown or 'other' amino acid
#   Z = "Glx";  glutamic acid or glutamine (E or Q)
#   http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
#
#   J = "Xle";  leucine or isoleucine (L or I, used in NMR)
#   Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#   Also the International Nucleotide Sequence Database Collaboration (INSDC)
#   (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
#   http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
#
#   Xle (J); Leucine or Isoleucine
#   The residue abbreviations, Xle (the three-letter abbreviation) and J
#   (the one-letter abbreviation) are reserved for the case that cannot
#   experimentally distinguish leucine from isoleucine.
#
#   U = "Sec";  selenocysteine
#   http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
#
#   O = "Pyl";  pyrrolysine
#   http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35

protein_letters_1to3 = {
    "A": "Ala",
    "C": "Cys",
    "D": "Asp",
    "E": "Glu",
    "F": "Phe",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "K": "Lys",
    "L": "Leu",
    "M": "Met",
    "N": "Asn",
    "P": "Pro",
    "Q": "Gln",
    "R": "Arg",
    "S": "Ser",
    "T": "Thr",
    "V": "Val",
    "W": "Trp",
    "Y": "Tyr",
}
protein_letters_1to3_extended = {
    **protein_letters_1to3,
    **{"B": "Asx", "X": "Xaa", "Z": "Glx", "J": "Xle", "U": "Sec", "O": "Pyl"},
}

protein_letters_3to1 = {value: key for key, value in protein_letters_1to3.items()}
protein_letters_3to1_extended = {
    value: key for key, value in protein_letters_1to3_extended.items()
}

ambiguous_dna_letters = "GATCRYWSMKHBVDN"
unambiguous_dna_letters = "GATC"
ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
unambiguous_rna_letters = "GAUC"

#   B == 5-bromouridine
#   D == 5,6-dihydrouridine
#   S == thiouridine
#   W == wyosine
extended_dna_letters = "GATCBDSW"

# are there extended forms?
# extended_rna_letters = "GAUCBDSW"

# "X" is included in the following _values and _complement dictionaries,
# for historical reasons although it is not an IUPAC nucleotide,
# and so is not in the corresponding _letters strings above
ambiguous_dna_values = {
    "A": "A",
    "C": "C",
    "G": "G",
    "T": "T",
    "M": "AC",
    "R": "AG",
    "W": "AT",
    "S": "CG",
    "Y": "CT",
    "K": "GT",
    "V": "ACG",
    "H": "ACT",
    "D": "AGT",
    "B": "CGT",
    "X": "GATC",
    "N": "GATC",
}
ambiguous_rna_values = {
    "A": "A",
    "C": "C",
    "G": "G",
    "U": "U",
    "M": "AC",
    "R": "AG",
    "W": "AU",
    "S": "CG",
    "Y": "CU",
    "K": "GU",
    "V": "ACG",
    "H": "ACU",
    "D": "AGU",
    "B": "CGU",
    "X": "GAUC",
    "N": "GAUC",
}

ambiguous_dna_complement = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}

ambiguous_rna_complement = {
    "A": "U",
    "C": "G",
    "G": "C",
    "U": "A",
    "M": "K",
    "R": "Y",
    "W": "W",
    "S": "S",
    "Y": "R",
    "K": "M",
    "V": "B",
    "H": "D",
    "D": "H",
    "B": "V",
    "X": "X",
    "N": "N",
}


def _make_ranges(mydict):
    d = {}
    for key, value in mydict.items():
        d[key] = (value, value)
    return d


# Mass data taken from PubChem


# Average masses of monophosphate deoxy nucleotides
unambiguous_dna_weights = {"A": 331.2218, "C": 307.1971, "G": 347.2212, "T": 322.2085}

# Monoisotopic masses of monophospate deoxy nucleotides
monoisotopic_unambiguous_dna_weights = {
    "A": 331.06817,
    "C": 307.056936,
    "G": 347.063084,
    "T": 322.056602,
}

unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)

unambiguous_rna_weights = {"A": 347.2212, "C": 323.1965, "G": 363.2206, "U": 324.1813}

monoisotopic_unambiguous_rna_weights = {
    "A": 347.063084,
    "C": 323.051851,
    "G": 363.057999,
    "U": 324.035867,
}

unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)


def _make_ambiguous_ranges(mydict, weight_table):
    range_d = {}
    avg_d = {}
    for letter, values in mydict.items():
        # Following line is a quick hack to skip undefined weights for U and O
        if len(values) == 1 and values[0] not in weight_table:
            continue

        weights = [weight_table.get(x) for x in values]
        range_d[letter] = (min(weights), max(weights))
        total_w = 0.0
        for w in weights:
            total_w = total_w + w
        avg_d[letter] = total_w / len(weights)
    return range_d, avg_d


ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges(
    ambiguous_dna_values, unambiguous_dna_weights
)

ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges(
    ambiguous_rna_values, unambiguous_rna_weights
)

protein_weights = {
    "A": 89.0932,
    "C": 121.1582,
    "D": 133.1027,
    "E": 147.1293,
    "F": 165.1891,
    "G": 75.0666,
    "H": 155.1546,
    "I": 131.1729,
    "K": 146.1876,
    "L": 131.1729,
    "M": 149.2113,
    "N": 132.1179,
    "O": 255.3134,
    "P": 115.1305,
    "Q": 146.1445,
    "R": 174.201,
    "S": 105.0926,
    "T": 119.1192,
    "U": 168.0532,
    "V": 117.1463,
    "W": 204.2252,
    "Y": 181.1885,
}

monoisotopic_protein_weights = {
    "A": 89.047678,
    "C": 121.019749,
    "D": 133.037508,
    "E": 147.053158,
    "F": 165.078979,
    "G": 75.032028,
    "H": 155.069477,
    "I": 131.094629,
    "K": 146.105528,
    "L": 131.094629,
    "M": 149.051049,
    "N": 132.053492,
    "O": 255.158292,
    "P": 115.063329,
    "Q": 146.069142,
    "R": 174.111676,
    "S": 105.042593,
    "T": 119.058243,
    "U": 168.964203,
    "V": 117.078979,
    "W": 204.089878,
    "Y": 181.073893,
}

extended_protein_values = {
    "A": "A",
    "B": "ND",
    "C": "C",
    "D": "D",
    "E": "E",
    "F": "F",
    "G": "G",
    "H": "H",
    "I": "I",
    "J": "IL",
    "K": "K",
    "L": "L",
    "M": "M",
    "N": "N",
    "O": "O",
    "P": "P",
    "Q": "Q",
    "R": "R",
    "S": "S",
    "T": "T",
    "U": "U",
    "V": "V",
    "W": "W",
    "X": "ACDEFGHIKLMNPQRSTVWY",
    # TODO - Include U and O in the possible values of X?
    # This could alter the extended_protein_weight_ranges ...
    # by MP: Won't do this, because they are so rare.
    "Y": "Y",
    "Z": "QE",
}

protein_weight_ranges = _make_ranges(protein_weights)

extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges(
    extended_protein_values, protein_weights
)


# For Center of Mass Calculation.
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
atom_weights = {
    "H": 1.00794,
    "D": 2.01410,
    "He": 4.002602,
    "Li": 6.941,
    "Be": 9.012182,
    "B": 10.811,
    "C": 12.0107,
    "N": 14.0067,
    "O": 15.9994,
    "F": 18.9984032,
    "Ne": 20.1797,
    "Na": 22.989770,
    "Mg": 24.3050,
    "Al": 26.981538,
    "Si": 28.0855,
    "P": 30.973761,
    "S": 32.065,
    "Cl": 35.453,
    "Ar": 39.948,
    "K": 39.0983,
    "Ca": 40.078,
    "Sc": 44.955910,
    "Ti": 47.867,
    "V": 50.9415,
    "Cr": 51.9961,
    "Mn": 54.938049,
    "Fe": 55.845,
    "Co": 58.933200,
    "Ni": 58.6934,
    "Cu": 63.546,
    "Zn": 65.39,
    "Ga": 69.723,
    "Ge": 72.64,
    "As": 74.92160,
    "Se": 78.96,
    "Br": 79.904,
    "Kr": 83.80,
    "Rb": 85.4678,
    "Sr": 87.62,
    "Y": 88.90585,
    "Zr": 91.224,
    "Nb": 92.90638,
    "Mo": 95.94,
    "Tc": 98.0,
    "Ru": 101.07,
    "Rh": 102.90550,
    "Pd": 106.42,
    "Ag": 107.8682,
    "Cd": 112.411,
    "In": 114.818,
    "Sn": 118.710,
    "Sb": 121.760,
    "Te": 127.60,
    "I": 126.90447,
    "Xe": 131.293,
    "Cs": 132.90545,
    "Ba": 137.327,
    "La": 138.9055,
    "Ce": 140.116,
    "Pr": 140.90765,
    "Nd": 144.24,
    "Pm": 145.0,
    "Sm": 150.36,
    "Eu": 151.964,
    "Gd": 157.25,
    "Tb": 158.92534,
    "Dy": 162.50,
    "Ho": 164.93032,
    "Er": 167.259,
    "Tm": 168.93421,
    "Yb": 173.04,
    "Lu": 174.967,
    "Hf": 178.49,
    "Ta": 180.9479,
    "W": 183.84,
    "Re": 186.207,
    "Os": 190.23,
    "Ir": 192.217,
    "Pt": 195.078,
    "Au": 196.96655,
    "Hg": 200.59,
    "Tl": 204.3833,
    "Pb": 207.2,
    "Bi": 208.98038,
    "Po": 208.98,
    "At": 209.99,
    "Rn": 222.02,
    "Fr": 223.02,
    "Ra": 226.03,
    "Ac": 227.03,
    "Th": 232.0381,
    "Pa": 231.03588,
    "U": 238.02891,
    "Np": 237.05,
    "Pu": 244.06,
    "Am": 243.06,
    "Cm": 247.07,
    "Bk": 247.07,
    "Cf": 251.08,
    "Es": 252.08,
    "Fm": 257.10,
    "Md": 258.10,
    "No": 259.10,
    "Lr": 262.11,
    "Rf": 261.11,
    "Db": 262.11,
    "Sg": 266.12,
    "Bh": 264.12,
    "Hs": 269.13,
    "Mt": 268.14,
}