# Copyright 2002 by Jeffrey Chang.
# All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Code for doing logistic regressions.

Classes:
 - LogisticRegression    Holds information for a LogisticRegression classifier.

Functions:
 - train        Train a new classifier.
 - calculate    Calculate the probabilities of each class, given an observation.
 - classify     Classify an observation into a class.

"""

import numpy
import numpy.linalg


class LogisticRegression:
    """Holds information necessary to do logistic regression classification.

    Attributes:
     - beta - List of the weights for each dimension.

    """

    def __init__(self):
        """Initialize the class."""
        self.beta = []


def train(xs, ys, update_fn=None, typecode=None):
    """Train a logistic regression classifier on a training set.

    Argument xs is a list of observations and ys is a list of the class
    assignments, which should be 0 or 1.  xs and ys should contain the
    same number of elements.  update_fn is an optional callback function
    that takes as parameters the iteration number and the log likelihood.
    """
    if len(xs) != len(ys):
        raise ValueError("xs and ys should be the same length.")
    classes = set(ys)
    if classes != {0, 1}:
        raise ValueError("Classes should be 0's and 1's")
    if typecode is None:
        typecode = "d"

    # Dimensionality of the data is the dimensionality of the
    # observations plus a constant dimension.
    N, ndims = len(xs), len(xs[0]) + 1
    if N == 0 or ndims == 1:
        raise ValueError("No observations or observation of 0 dimension.")

    # Make an X array, with a constant first dimension.
    X = numpy.ones((N, ndims), typecode)
    X[:, 1:] = xs
    Xt = numpy.transpose(X)
    y = numpy.asarray(ys, typecode)

    # Initialize the beta parameter to 0.
    beta = numpy.zeros(ndims, typecode)

    MAX_ITERATIONS = 500
    CONVERGE_THRESHOLD = 0.01
    stepsize = 1.0
    # Now iterate using Newton-Raphson until the log-likelihoods
    # converge.
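    # Each Newton-Raphson step solves (X^T W X) delta = X^T (y - p) for delta
    # and updates beta <- beta + stepsize * delta, where X^T (y - p) is the
    # gradient of the log likelihood and -X^T W X, with W = diag(p * (1 - p)),
    # is its Hessian.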
    i = 0
    old_beta = old_llik = None
    while i < MAX_ITERATIONS:
        # Calculate the probabilities.  p = e^(beta X) / (1+e^(beta X))
        ebetaX = numpy.exp(numpy.dot(beta, Xt))
        p = ebetaX / (1 + ebetaX)

        # Find the log likelihood score and see if I've converged.
        logp = y * numpy.log(p) + (1 - y) * numpy.log(1 - p)
        llik = sum(logp)
        if update_fn is not None:
            update_fn(i, llik)
        if old_llik is not None:
            # Check to see if the likelihood decreased.  If it did, then
            # restore the old beta parameters and halve the step size.
            if llik < old_llik:
                stepsize /= 2.0
                beta = old_beta
            # If I've converged, then stop.
            if numpy.fabs(llik - old_llik) <= CONVERGE_THRESHOLD:
                break
        old_llik, old_beta = llik, beta
        i += 1

        # W is the Newton-Raphson weight matrix, diag(p * (1 - p)).
        W = numpy.identity(N) * (p * (1 - p))
        Xtyp = numpy.dot(Xt, y - p)  # Calculate the first derivative.
        XtWX = numpy.dot(numpy.dot(Xt, W), X)  # Calculate the second derivative.
        delta = numpy.linalg.solve(XtWX, Xtyp)
        if numpy.fabs(stepsize - 1.0) > 0.001:
            delta *= stepsize
        beta += delta  # Update beta.
    else:
        raise RuntimeError("Didn't converge.")

    lr = LogisticRegression()
    lr.beta = [float(x) for x in beta]  # Convert back to a plain Python list.
    return lr


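# An illustrative sketch (names and values assumed, not part of the original
# module) of the update_fn callback accepted by train: it is called once per
# iteration with the iteration number and the current log likelihood.
#
#     def show_progress(iteration, loglikelihood):
#         print(f"iteration {iteration}: log likelihood {loglikelihood:.4f}")
#
#     model = train(xs, ys, update_fn=show_progress)

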
def calculate(lr, x):
    """Calculate the probability for each class.

    Arguments:
     - lr is a LogisticRegression object.
     - x is the observed data.

    Returns a list of the probabilities that the observation belongs to
    each class.
    """
    # Insert a constant term for x.
    x = numpy.asarray([1.0] + x)
    # Calculate the probability.  p = e^(beta X) / (1+e^(beta X))
    ebetaX = numpy.exp(numpy.dot(lr.beta, x))
    p = ebetaX / (1 + ebetaX)
    return [1 - p, p]


def classify(lr, x):
    """Classify an observation into a class."""
    probs = calculate(lr, x)
    if probs[0] > probs[1]:
        return 0
    return 1
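

# A minimal usage sketch (illustrative data, not part of the original module):
# train on a tiny 1-D dataset whose classes overlap (so the maximum likelihood
# estimate is finite), then score and classify a new observation.
if __name__ == "__main__":
    xs = [[1], [2], [2], [3], [3], [4]]  # observations, one feature each
    ys = [0, 0, 1, 0, 1, 1]  # class labels in {0, 1}
    model = train(xs, ys)
    print("beta:", model.beta)
    print("P(class 0), P(class 1) for [2.5]:", calculate(model, [2.5]))
    print("predicted class for [2.5]:", classify(model, [2.5]))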