mirror of
https://github.com/biopython/biopython.git
synced 2025-10-20 21:53:47 +08:00
$ ruff check --fix --select=I \ --config=lint.isort.force-single-line=true \ --config=lint.isort.order-by-type=false \ BioSQL/ Bio/ Tests/ Scripts/ Doc/ setup.py Using ruff version 0.4.10
226 lines
8.1 KiB
Python
226 lines
8.1 KiB
Python
# Copyright 2000 by Jeffrey Chang. All rights reserved.
|
|
#
|
|
# This file is part of the Biopython distribution and governed by your
|
|
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
|
|
# Please see the LICENSE file that should have been included as part of this
|
|
# package.
|
|
|
|
"""General Naive Bayes learner (DEPRECATED).
|
|
|
|
Naive Bayes is a supervised classification algorithm that uses Bayes
|
|
rule to compute the fit between a new observation and some previously
|
|
observed data. The observations are discrete feature vectors, with
|
|
the Bayes assumption that the features are independent. Although this
|
|
is hardly ever true, the classifier works well enough in practice.
|
|
|
|
Glossary:
|
|
- observation - A feature vector of discrete data.
|
|
- class - A possible classification for an observation.
|
|
|
|
Classes:
|
|
- NaiveBayes - Holds information for a naive Bayes classifier.
|
|
|
|
Functions:
|
|
- train - Train a new naive Bayes classifier.
|
|
- calculate - Calculate the probabilities of each class,
|
|
given an observation.
|
|
- classify - Classify an observation into a class.
|
|
|
|
"""
|
|
|
|
import warnings
|
|
|
|
from Bio import BiopythonDeprecationWarning
|
|
|
|
warnings.warn(
|
|
"The 'Bio.NaiveBayes' module is deprecated and will be removed in a future "
|
|
"release of Biopython. Consider using scikit-learn instead.",
|
|
BiopythonDeprecationWarning,
|
|
)
|
|
|
|
|
|
try:
|
|
import numpy as np
|
|
except ImportError:
|
|
from Bio import MissingPythonDependencyError
|
|
|
|
raise MissingPythonDependencyError(
|
|
"Please install NumPy if you want to use Bio.NaiveBayes. "
|
|
"See http://www.numpy.org/"
|
|
) from None
|
|
|
|
|
|
def _contents(items):
|
|
"""Return a dictionary where the key is the item and the value is the probability associated (PRIVATE)."""
|
|
term = 1.0 / len(items)
|
|
counts = {}
|
|
for item in items:
|
|
counts[item] = counts.get(item, 0) + term
|
|
return counts
|
|
|
|
|
|
class NaiveBayes:
|
|
"""Hold information for a NaiveBayes classifier.
|
|
|
|
Attributes:
|
|
- classes - List of the possible classes of data.
|
|
- p_conditional - CLASS x DIM array of dicts of value -> ``P(value|class,dim)``
|
|
- p_prior - List of the prior probabilities for every class.
|
|
- dimensionality - Dimensionality of the data.
|
|
|
|
"""
|
|
|
|
def __init__(self):
|
|
"""Initialize the class."""
|
|
self.classes = []
|
|
self.p_conditional = None
|
|
self.p_prior = []
|
|
self.dimensionality = None
|
|
|
|
|
|
def calculate(nb, observation, scale=False):
|
|
"""Calculate the logarithmic conditional probability for each class.
|
|
|
|
Arguments:
|
|
- nb - A NaiveBayes classifier that has been trained.
|
|
- observation - A list representing the observed data.
|
|
- scale - Boolean to indicate whether the probability should be
|
|
scaled by ``P(observation)``. By default, no scaling is done.
|
|
|
|
A dictionary is returned where the key is the class and the value is
|
|
the log probability of the class.
|
|
"""
|
|
# P(class|observation) = P(observation|class)*P(class)/P(observation)
|
|
# Taking the log:
|
|
# lP(class|observation) = lP(observation|class)+lP(class)-lP(observation)
|
|
|
|
# Make sure the observation has the right dimensionality.
|
|
if len(observation) != nb.dimensionality:
|
|
raise ValueError(
|
|
f"observation in {len(observation)} dimension,"
|
|
f" but classifier in {nb.dimensionality}"
|
|
)
|
|
|
|
# Calculate log P(observation|class) for every class.
|
|
n = len(nb.classes)
|
|
lp_observation_class = np.zeros(n) # array of log P(observation|class)
|
|
for i in range(n):
|
|
# log P(observation|class) = SUM_i log P(observation_i|class)
|
|
probs = [None] * len(observation)
|
|
for j in range(len(observation)):
|
|
probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
|
|
lprobs = np.log(np.clip(probs, 1.0e-300, 1.0e300))
|
|
lp_observation_class[i] = sum(lprobs)
|
|
|
|
# Calculate log P(class).
|
|
lp_prior = np.log(nb.p_prior)
|
|
|
|
# Calculate log P(observation).
|
|
lp_observation = 0.0 # P(observation)
|
|
if scale: # Only calculate this if requested.
|
|
# log P(observation) = log SUM_i P(observation|class_i)P(class_i)
|
|
obs = np.exp(np.clip(lp_prior + lp_observation_class, -700, +700))
|
|
lp_observation = np.log(sum(obs))
|
|
|
|
# Calculate log P(class|observation).
|
|
lp_class_observation = {} # Dict of class : log P(class|observation)
|
|
for i in range(len(nb.classes)):
|
|
lp_class_observation[nb.classes[i]] = (
|
|
lp_observation_class[i] + lp_prior[i] - lp_observation
|
|
)
|
|
|
|
return lp_class_observation
|
|
|
|
|
|
def classify(nb, observation):
|
|
"""Classify an observation into a class."""
|
|
# The class is the one with the highest probability.
|
|
probs = calculate(nb, observation, scale=False)
|
|
max_prob = max_class = None
|
|
for klass in nb.classes:
|
|
if max_prob is None or probs[klass] > max_prob:
|
|
max_prob, max_class = probs[klass], klass
|
|
return max_class
|
|
|
|
|
|
def train(training_set, results, priors=None, typecode=None):
|
|
"""Train a NaiveBayes classifier on a training set.
|
|
|
|
Arguments:
|
|
- training_set - List of observations.
|
|
- results - List of the class assignments for each observation.
|
|
Thus, training_set and results must be the same length.
|
|
- priors - Optional dictionary specifying the prior probabilities
|
|
for each type of result. If not specified, the priors will
|
|
be estimated from the training results.
|
|
|
|
"""
|
|
if not len(training_set):
|
|
raise ValueError("No data in the training set.")
|
|
if len(training_set) != len(results):
|
|
raise ValueError("training_set and results should be parallel lists.")
|
|
|
|
# If no typecode is specified, try to pick a reasonable one. If
|
|
# training_set is a Numeric array, then use that typecode.
|
|
# Otherwise, choose a reasonable default.
|
|
# XXX NOT IMPLEMENTED
|
|
|
|
# Check to make sure each vector in the training set has the same
|
|
# dimensionality.
|
|
dimensions = [len(x) for x in training_set]
|
|
if min(dimensions) != max(dimensions):
|
|
raise ValueError("observations have different dimensionality")
|
|
|
|
nb = NaiveBayes()
|
|
nb.dimensionality = dimensions[0]
|
|
|
|
# Get a list of all the classes, and
|
|
# estimate the prior probabilities for the classes.
|
|
if priors is not None:
|
|
percs = priors
|
|
nb.classes = list(set(results))
|
|
else:
|
|
class_freq = _contents(results)
|
|
nb.classes = list(class_freq.keys())
|
|
percs = class_freq
|
|
nb.classes.sort() # keep it tidy
|
|
|
|
nb.p_prior = np.zeros(len(nb.classes))
|
|
for i in range(len(nb.classes)):
|
|
nb.p_prior[i] = percs[nb.classes[i]]
|
|
|
|
# Collect all the observations in class. For each class, make a
|
|
# matrix of training instances versus dimensions. I might be able
|
|
# to optimize this with Numeric, if the training_set parameter
|
|
# were guaranteed to be a matrix. However, this may not be the
|
|
# case, because the client may be hacking up a sparse matrix or
|
|
# something.
|
|
c2i = {} # class to index of class
|
|
for index, key in enumerate(nb.classes):
|
|
c2i[key] = index
|
|
observations = [[] for c in nb.classes] # separate observations by class
|
|
for i in range(len(results)):
|
|
klass, obs = results[i], training_set[i]
|
|
observations[c2i[klass]].append(obs)
|
|
# Now make the observations Numeric matrix.
|
|
for i in range(len(observations)):
|
|
# XXX typecode must be specified!
|
|
observations[i] = np.asarray(observations[i], typecode)
|
|
|
|
# Calculate P(value|class,dim) for every class.
|
|
# This is a good loop to optimize.
|
|
nb.p_conditional = []
|
|
for i in range(len(nb.classes)):
|
|
class_observations = observations[i] # observations for this class
|
|
nb.p_conditional.append([None] * nb.dimensionality)
|
|
for j in range(nb.dimensionality):
|
|
# Collect all the values in this dimension.
|
|
values = class_observations[:, j]
|
|
|
|
# Add pseudocounts here. This needs to be parameterized.
|
|
# values = list(values) + range(len(nb.classes)) # XXX add 1
|
|
|
|
# Estimate P(value|class,dim)
|
|
nb.p_conditional[i][j] = _contents(values)
|
|
return nb
|