mirror of
https://github.com/biopython/biopython.git
synced 2025-10-21 06:23:48 +08:00
Reverted unwanted changes (mostly arrays in tests, but also some whitespace in doctests). Remaining changes are standardising spacing between module docstring and imports, and lower-case \x<hex> in strings.
421 lines
9.0 KiB
Python
421 lines
9.0 KiB
Python
# Copyright 2000 Andrew Dalke. All rights reserved.
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
"""Information about the IUPAC alphabets."""
|
|
|
|
protein_letters = "ACDEFGHIKLMNPQRSTVWY"
|
|
extended_protein_letters = "ACDEFGHIKLMNPQRSTVWYBXZJUO"
|
|
# B = "Asx"; aspartic acid or asparagine (D or N)
|
|
# X = "Xxx"; unknown or 'other' amino acid
|
|
# Z = "Glx"; glutamic acid or glutamine (E or Q)
|
|
# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html#AA212
|
|
#
|
|
# J = "Xle"; leucine or isoleucine (L or I, used in NMR)
|
|
# Mentioned in http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
|
|
# Also the International Nucleotide Sequence Database Collaboration (INSDC)
|
|
# (i.e. GenBank, EMBL, DDBJ) adopted this in 2006
|
|
# http://www.ddbj.nig.ac.jp/insdc/icm2006-e.html
|
|
#
|
|
# Xle (J); Leucine or Isoleucine
|
|
# The residue abbreviations, Xle (the three-letter abbreviation) and J
|
|
# (the one-letter abbreviation) are reserved for the case that cannot
|
|
# experimentally distinguish leucine from isoleucine.
|
|
#
|
|
# U = "Sec"; selenocysteine
|
|
# http://www.chem.qmul.ac.uk/iubmb/newsletter/1999/item3.html
|
|
#
|
|
# O = "Pyl"; pyrrolysine
|
|
# http://www.chem.qmul.ac.uk/iubmb/newsletter/2009.html#item35
|
|
|
|
protein_letters_1to3 = {
|
|
"A": "Ala",
|
|
"C": "Cys",
|
|
"D": "Asp",
|
|
"E": "Glu",
|
|
"F": "Phe",
|
|
"G": "Gly",
|
|
"H": "His",
|
|
"I": "Ile",
|
|
"K": "Lys",
|
|
"L": "Leu",
|
|
"M": "Met",
|
|
"N": "Asn",
|
|
"P": "Pro",
|
|
"Q": "Gln",
|
|
"R": "Arg",
|
|
"S": "Ser",
|
|
"T": "Thr",
|
|
"V": "Val",
|
|
"W": "Trp",
|
|
"Y": "Tyr",
|
|
}
|
|
protein_letters_1to3_extended = {
|
|
**protein_letters_1to3,
|
|
**{"B": "Asx", "X": "Xaa", "Z": "Glx", "J": "Xle", "U": "Sec", "O": "Pyl"},
|
|
}
|
|
|
|
protein_letters_3to1 = {value: key for key, value in protein_letters_1to3.items()}
|
|
protein_letters_3to1_extended = {
|
|
value: key for key, value in protein_letters_1to3_extended.items()
|
|
}
|
|
|
|
ambiguous_dna_letters = "GATCRYWSMKHBVDN"
|
|
unambiguous_dna_letters = "GATC"
|
|
ambiguous_rna_letters = "GAUCRYWSMKHBVDN"
|
|
unambiguous_rna_letters = "GAUC"
|
|
|
|
# B == 5-bromouridine
|
|
# D == 5,6-dihydrouridine
|
|
# S == thiouridine
|
|
# W == wyosine
|
|
extended_dna_letters = "GATCBDSW"
|
|
|
|
# are there extended forms?
|
|
# extended_rna_letters = "GAUCBDSW"
|
|
|
|
# "X" is included in the following _values and _complement dictionaries,
|
|
# for historical reasons although it is not an IUPAC nucleotide,
|
|
# and so is not in the corresponding _letters strings above
|
|
ambiguous_dna_values = {
|
|
"A": "A",
|
|
"C": "C",
|
|
"G": "G",
|
|
"T": "T",
|
|
"M": "AC",
|
|
"R": "AG",
|
|
"W": "AT",
|
|
"S": "CG",
|
|
"Y": "CT",
|
|
"K": "GT",
|
|
"V": "ACG",
|
|
"H": "ACT",
|
|
"D": "AGT",
|
|
"B": "CGT",
|
|
"X": "GATC",
|
|
"N": "GATC",
|
|
}
|
|
ambiguous_rna_values = {
|
|
"A": "A",
|
|
"C": "C",
|
|
"G": "G",
|
|
"U": "U",
|
|
"M": "AC",
|
|
"R": "AG",
|
|
"W": "AU",
|
|
"S": "CG",
|
|
"Y": "CU",
|
|
"K": "GU",
|
|
"V": "ACG",
|
|
"H": "ACU",
|
|
"D": "AGU",
|
|
"B": "CGU",
|
|
"X": "GAUC",
|
|
"N": "GAUC",
|
|
}
|
|
|
|
ambiguous_dna_complement = {
|
|
"A": "T",
|
|
"C": "G",
|
|
"G": "C",
|
|
"T": "A",
|
|
"M": "K",
|
|
"R": "Y",
|
|
"W": "W",
|
|
"S": "S",
|
|
"Y": "R",
|
|
"K": "M",
|
|
"V": "B",
|
|
"H": "D",
|
|
"D": "H",
|
|
"B": "V",
|
|
"X": "X",
|
|
"N": "N",
|
|
}
|
|
|
|
ambiguous_rna_complement = {
|
|
"A": "U",
|
|
"C": "G",
|
|
"G": "C",
|
|
"U": "A",
|
|
"M": "K",
|
|
"R": "Y",
|
|
"W": "W",
|
|
"S": "S",
|
|
"Y": "R",
|
|
"K": "M",
|
|
"V": "B",
|
|
"H": "D",
|
|
"D": "H",
|
|
"B": "V",
|
|
"X": "X",
|
|
"N": "N",
|
|
}
|
|
|
|
|
|
def _make_ranges(mydict):
|
|
d = {}
|
|
for key, value in mydict.items():
|
|
d[key] = (value, value)
|
|
return d
|
|
|
|
|
|
# Mass data taken from PubChem
|
|
|
|
|
|
# Average masses of monophosphate deoxy nucleotides
|
|
unambiguous_dna_weights = {"A": 331.2218, "C": 307.1971, "G": 347.2212, "T": 322.2085}
|
|
|
|
# Monoisotopic masses of monophospate deoxy nucleotides
|
|
monoisotopic_unambiguous_dna_weights = {
|
|
"A": 331.06817,
|
|
"C": 307.056936,
|
|
"G": 347.063084,
|
|
"T": 322.056602,
|
|
}
|
|
|
|
unambiguous_dna_weight_ranges = _make_ranges(unambiguous_dna_weights)
|
|
|
|
unambiguous_rna_weights = {"A": 347.2212, "C": 323.1965, "G": 363.2206, "U": 324.1813}
|
|
|
|
monoisotopic_unambiguous_rna_weights = {
|
|
"A": 347.063084,
|
|
"C": 323.051851,
|
|
"G": 363.057999,
|
|
"U": 324.035867,
|
|
}
|
|
|
|
unambiguous_rna_weight_ranges = _make_ranges(unambiguous_rna_weights)
|
|
|
|
|
|
def _make_ambiguous_ranges(mydict, weight_table):
|
|
range_d = {}
|
|
avg_d = {}
|
|
for letter, values in mydict.items():
|
|
# Following line is a quick hack to skip undefined weights for U and O
|
|
if len(values) == 1 and values[0] not in weight_table:
|
|
continue
|
|
|
|
weights = [weight_table.get(x) for x in values]
|
|
range_d[letter] = (min(weights), max(weights))
|
|
total_w = 0.0
|
|
for w in weights:
|
|
total_w = total_w + w
|
|
avg_d[letter] = total_w / len(weights)
|
|
return range_d, avg_d
|
|
|
|
|
|
ambiguous_dna_weight_ranges, avg_ambiguous_dna_weights = _make_ambiguous_ranges(
|
|
ambiguous_dna_values, unambiguous_dna_weights
|
|
)
|
|
|
|
ambiguous_rna_weight_ranges, avg_ambiguous_rna_weights = _make_ambiguous_ranges(
|
|
ambiguous_rna_values, unambiguous_rna_weights
|
|
)
|
|
|
|
protein_weights = {
|
|
"A": 89.0932,
|
|
"C": 121.1582,
|
|
"D": 133.1027,
|
|
"E": 147.1293,
|
|
"F": 165.1891,
|
|
"G": 75.0666,
|
|
"H": 155.1546,
|
|
"I": 131.1729,
|
|
"K": 146.1876,
|
|
"L": 131.1729,
|
|
"M": 149.2113,
|
|
"N": 132.1179,
|
|
"O": 255.3134,
|
|
"P": 115.1305,
|
|
"Q": 146.1445,
|
|
"R": 174.201,
|
|
"S": 105.0926,
|
|
"T": 119.1192,
|
|
"U": 168.0532,
|
|
"V": 117.1463,
|
|
"W": 204.2252,
|
|
"Y": 181.1885,
|
|
}
|
|
|
|
monoisotopic_protein_weights = {
|
|
"A": 89.047678,
|
|
"C": 121.019749,
|
|
"D": 133.037508,
|
|
"E": 147.053158,
|
|
"F": 165.078979,
|
|
"G": 75.032028,
|
|
"H": 155.069477,
|
|
"I": 131.094629,
|
|
"K": 146.105528,
|
|
"L": 131.094629,
|
|
"M": 149.051049,
|
|
"N": 132.053492,
|
|
"O": 255.158292,
|
|
"P": 115.063329,
|
|
"Q": 146.069142,
|
|
"R": 174.111676,
|
|
"S": 105.042593,
|
|
"T": 119.058243,
|
|
"U": 168.964203,
|
|
"V": 117.078979,
|
|
"W": 204.089878,
|
|
"Y": 181.073893,
|
|
}
|
|
|
|
extended_protein_values = {
|
|
"A": "A",
|
|
"B": "ND",
|
|
"C": "C",
|
|
"D": "D",
|
|
"E": "E",
|
|
"F": "F",
|
|
"G": "G",
|
|
"H": "H",
|
|
"I": "I",
|
|
"J": "IL",
|
|
"K": "K",
|
|
"L": "L",
|
|
"M": "M",
|
|
"N": "N",
|
|
"O": "O",
|
|
"P": "P",
|
|
"Q": "Q",
|
|
"R": "R",
|
|
"S": "S",
|
|
"T": "T",
|
|
"U": "U",
|
|
"V": "V",
|
|
"W": "W",
|
|
"X": "ACDEFGHIKLMNPQRSTVWY",
|
|
# TODO - Include U and O in the possible values of X?
|
|
# This could alter the extended_protein_weight_ranges ...
|
|
# by MP: Won't do this, because they are so rare.
|
|
"Y": "Y",
|
|
"Z": "QE",
|
|
}
|
|
|
|
protein_weight_ranges = _make_ranges(protein_weights)
|
|
|
|
extended_protein_weight_ranges, avg_extended_protein_weights = _make_ambiguous_ranges(
|
|
extended_protein_values, protein_weights
|
|
)
|
|
|
|
|
|
# For Center of Mass Calculation.
|
|
# Taken from http://www.chem.qmul.ac.uk/iupac/AtWt/ & PyMol
|
|
atom_weights = {
|
|
"H": 1.00794,
|
|
"D": 2.01410,
|
|
"He": 4.002602,
|
|
"Li": 6.941,
|
|
"Be": 9.012182,
|
|
"B": 10.811,
|
|
"C": 12.0107,
|
|
"N": 14.0067,
|
|
"O": 15.9994,
|
|
"F": 18.9984032,
|
|
"Ne": 20.1797,
|
|
"Na": 22.989770,
|
|
"Mg": 24.3050,
|
|
"Al": 26.981538,
|
|
"Si": 28.0855,
|
|
"P": 30.973761,
|
|
"S": 32.065,
|
|
"Cl": 35.453,
|
|
"Ar": 39.948,
|
|
"K": 39.0983,
|
|
"Ca": 40.078,
|
|
"Sc": 44.955910,
|
|
"Ti": 47.867,
|
|
"V": 50.9415,
|
|
"Cr": 51.9961,
|
|
"Mn": 54.938049,
|
|
"Fe": 55.845,
|
|
"Co": 58.933200,
|
|
"Ni": 58.6934,
|
|
"Cu": 63.546,
|
|
"Zn": 65.39,
|
|
"Ga": 69.723,
|
|
"Ge": 72.64,
|
|
"As": 74.92160,
|
|
"Se": 78.96,
|
|
"Br": 79.904,
|
|
"Kr": 83.80,
|
|
"Rb": 85.4678,
|
|
"Sr": 87.62,
|
|
"Y": 88.90585,
|
|
"Zr": 91.224,
|
|
"Nb": 92.90638,
|
|
"Mo": 95.94,
|
|
"Tc": 98.0,
|
|
"Ru": 101.07,
|
|
"Rh": 102.90550,
|
|
"Pd": 106.42,
|
|
"Ag": 107.8682,
|
|
"Cd": 112.411,
|
|
"In": 114.818,
|
|
"Sn": 118.710,
|
|
"Sb": 121.760,
|
|
"Te": 127.60,
|
|
"I": 126.90447,
|
|
"Xe": 131.293,
|
|
"Cs": 132.90545,
|
|
"Ba": 137.327,
|
|
"La": 138.9055,
|
|
"Ce": 140.116,
|
|
"Pr": 140.90765,
|
|
"Nd": 144.24,
|
|
"Pm": 145.0,
|
|
"Sm": 150.36,
|
|
"Eu": 151.964,
|
|
"Gd": 157.25,
|
|
"Tb": 158.92534,
|
|
"Dy": 162.50,
|
|
"Ho": 164.93032,
|
|
"Er": 167.259,
|
|
"Tm": 168.93421,
|
|
"Yb": 173.04,
|
|
"Lu": 174.967,
|
|
"Hf": 178.49,
|
|
"Ta": 180.9479,
|
|
"W": 183.84,
|
|
"Re": 186.207,
|
|
"Os": 190.23,
|
|
"Ir": 192.217,
|
|
"Pt": 195.078,
|
|
"Au": 196.96655,
|
|
"Hg": 200.59,
|
|
"Tl": 204.3833,
|
|
"Pb": 207.2,
|
|
"Bi": 208.98038,
|
|
"Po": 208.98,
|
|
"At": 209.99,
|
|
"Rn": 222.02,
|
|
"Fr": 223.02,
|
|
"Ra": 226.03,
|
|
"Ac": 227.03,
|
|
"Th": 232.0381,
|
|
"Pa": 231.03588,
|
|
"U": 238.02891,
|
|
"Np": 237.05,
|
|
"Pu": 244.06,
|
|
"Am": 243.06,
|
|
"Cm": 247.07,
|
|
"Bk": 247.07,
|
|
"Cf": 251.08,
|
|
"Es": 252.08,
|
|
"Fm": 257.10,
|
|
"Md": 258.10,
|
|
"No": 259.10,
|
|
"Lr": 262.11,
|
|
"Rf": 261.11,
|
|
"Db": 262.11,
|
|
"Sg": 266.12,
|
|
"Bh": 264.12,
|
|
"Hs": 269.13,
|
|
"Mt": 268.14,
|
|
}
|