Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/deploy_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,15 @@ DEST_SLUG=biopython/docs
# Biopython was installed to run Sphinx and build the docs,
# can use this:
DEST_DIR=`python -c "import Bio; v=Bio.__version__; print('dev' if 'dev' in v else v)"`
SOURCE_DIR=${TRAVIS_BUILD_DIR:-$PWD}/Doc/api/_build/html
SOURCE_DIR=${TRAVIS_BUILD_DIR:-$PWD}/Doc/_build/html
WORKING_DIR=/tmp/deploy_biopython_docs

if [ -z "$DEST_DIR" ]; then
echo "ERROR: Failed to get Biopython version, is it not installed?"
python -c "import Bio; print(Bio.__version__)"
false
fi
DEST_DIR=$DEST_DIR/api
DEST_DIR=$DEST_DIR/
echo "Aiming to deploy $SOURCE_DIR to $DEST_SLUG branch gh-pages as $DEST_DIR"

# On TravisCI, must create the variable using '\ ' and '\n', so
Expand Down
47 changes: 47 additions & 0 deletions .github/ref_fix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/env python
import re
import sys

# Alternative, greedier pattern (any characters in both groups), left
# commented out in favour of the restricted character class below:
# re_link = re.compile(r"`\[(.+)\] <#(.+)>`__")
# Matches a link written as `[label] <#label>`__ and captures the bracketed
# text as group 1 and the anchor following "<#" as group 2.
re_link = re.compile(r"`\[([A-Za-z0-9_:\-.]+)\] <#([A-Za-z0-9_:\-.]+)>`__")

# Self-checks executed every time the script starts: each sample line must
# yield at least one match (an empty result is falsy and fails the assert).
assert re_link.findall(r"Chapter \ `[chapter:quick_start] <#chapter:quick_start>`__ before\n")
assert re_link.findall(r"(see Section `[sec:appendix-handles] <#sec:appendix-handles>`__):")
assert re_link.findall(r"Section `[sec:Bio.SeqIO-and-StringIO] <#sec:Bio.SeqIO-and-StringIO>`__):")
assert re_link.findall(r"Figure `[fig:three_track_cl2] <#fig:three_track_cl2>`__.")
assert list(re_link.finditer(r"in Chapter \ `[chapter:seq_annot] <#chapter:seq_annot>`__. This aims to"))
# Two links on the same line must be reported as two separate matches.
assert len(list(re_link.finditer(r"functions (`[eq:OP] <#eq:OP>`__) and (`[eq:NOP] <#eq:NOP>`__)."))) == 2

# Matches a link whose visible text is made of digits and dots, such as
# `1.1.9 <#sec:label>`__, and captures only the anchor as group 1.
re_section = re.compile(r"`[0-9.]+ <#([A-Za-z0-9_:\-.]+)>`__")

assert re_section.findall(r"the label’s color (used in Section `1.1.9 <#sec:gd_nice_example>`__).")

def fix_line(line):
    """Return ``line`` with anchor-style links rewritten as :ref: roles.

    Text matched by re_link is handled first and text matched by re_section
    second; every match is replaced by a :ref: role naming its captured
    label, and each substitution is reported on stdout.
    """
    # e.g. Chapter \ `[chapter:quick_start] <#chapter:quick_start>`__
    line = line.replace("\xa0\\ ", " ")
    for pattern in (re_link, re_section):
        # The matches are located in the line as it stands when this pass
        # starts, i.e. the re_section pass sees the re_link replacements.
        for found in pattern.finditer(line):
            original = found.group()
            label = found.group(1)
            if pattern is re_link:
                # Bracketed text and anchor are expected to be the same.
                assert label == found.group(2), original
            else:
                assert original.endswith("<#%s>`__" % label), original
            replacement = r":ref:`%s`" % label
            line = line.replace(original, replacement)
            print("%s -> %s" % (original, replacement))
    return line

def fix_file(filename):
    """Rewrite ``filename`` in place, passing every line through fix_line.

    All lines are converted before the file is reopened for writing.
    Opening with mode "w" truncates the file, so converting first means an
    exception raised by fix_line (for example one of its assertions) leaves
    the file on disk untouched instead of cut off part-way through.
    """
    with open(filename) as handle:
        fixed_lines = [fix_line(line) for line in handle]
    with open(filename, "w") as handle:
        handle.writelines(fixed_lines)

# Command-line driver: every argument is a file to fix in place; the name
# of each file is announced on stderr before it is processed.
for path in sys.argv[1:]:
    sys.stderr.write("Fixing %s\n" % path)
    fix_file(path)
3 changes: 3 additions & 0 deletions .github/requirements-rtd.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# dot to mean current dir, i.e. Biopython itself
.
numpydoc
20 changes: 20 additions & 0 deletions .github/rst_chunk.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#!/usr/bin/env python
import sys

# The label prefix comes from the command line when exactly one argument is
# given; with any other argument count it falls back to "chapter".
prefix = sys.argv[1] if len(sys.argv) == 2 else "chapter"
marker = ".. _%s" % prefix

# Input lines are copied to stdout until the first label line is seen; from
# then on each label line switches output to a new <prefix>_<name>.rst file.
handle = sys.stdout
for line in sys.stdin:
    stripped = line.rstrip()
    if stripped.startswith(marker) and stripped.endswith(":"):
        if handle is not sys.stdout:
            handle.close()
        # Drop the marker, the single character after it, and the trailing
        # colon; what is left becomes the name part of the output file.
        filename = "%s_%s.rst" % (prefix, stripped[len(marker) + 1:-1])
        sys.stderr.write("Starting %s\n" % filename)
        handle = open(filename, "w")
    handle.write(line)
if handle is not sys.stdout:
    handle.close()
4 changes: 2 additions & 2 deletions .travis-tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -166,5 +166,5 @@ deps =
sphinx_rtd_theme
commands =
bash -c \'python setup.py install > /dev/null\'
bash -c \'mkdir -p Doc/api/_templates Doc/api/_static Doc/api/_build\'
make -C Doc/api/ html
bash -c \'mkdir -p Doc/_templates Doc/_static Doc/_build\'
make -C Doc/ html
71 changes: 20 additions & 51 deletions Doc/Makefile
Original file line number Diff line number Diff line change
@@ -1,51 +1,20 @@
# Builds the Biopython Tutorial (PDF, HTML and plain text) and the Bio.PDB
# FAQ (PDF) from their LaTeX sources with pdflatex and hevea.

# Sub-directories to recurse into. The list is empty here, so the
# *-subdirs targets and the $(subdirs) rule at the bottom have nothing to do.
subdirs :=

# These targets name actions, not files. Declaring them phony stops a stray
# file called e.g. "clean" or "all" from making the target look up to date.
.PHONY: all pdf clean clean-subdirs distclean distclean-subdirs $(subdirs)

all: Tutorial.html Tutorial.txt pdf $(subdirs)
pdf: Tutorial.pdf biopdb_faq.pdf

# pdflatex is run three times in a row, presumably so that the table of
# contents and cross-references settle -- confirm before reducing.
Tutorial.pdf: Tutorial.tex Tutorial/chapter_*.tex
	pdflatex --shell-escape Tutorial.tex
	pdflatex --shell-escape Tutorial.tex
	pdflatex --shell-escape Tutorial.tex

biopdb_faq.pdf: biopdb_faq.tex
	pdflatex biopdb_faq.tex
	pdflatex biopdb_faq.tex
	pdflatex biopdb_faq.tex

Tutorial.html: Tutorial.tex Tutorial/chapter_*.tex
	hevea -fix Tutorial.tex

Tutorial.txt: Tutorial.tex Tutorial/chapter_*.tex
	hevea -fix -text Tutorial.tex

# Run "clean" in every sub-directory, stopping at the first failure instead
# of reporting only the status of the last sub-make.
clean-subdirs: $(subdirs)
	( for f in $^ ; do $(MAKE) clean -C $$f || exit 1 ; done )

# Remove intermediate LaTeX/hevea files but keep the finished documents.
clean: clean-subdirs
	rm -f Tutorial.aux Tutorial.toc Tutorial.log Tutorial.out
	rm -f Tutorial.haux Tutorial.htoc
	rm -f biopdb_faq.aux biopdb_faq.log biopdb_faq.out
	rm -f Tutorial/*.aux

# Run "distclean" in every sub-directory, stopping at the first failure.
distclean-subdirs: $(subdirs)
	( for f in $^ ; do $(MAKE) distclean -C $$f || exit 1 ; done )

# Additionally remove the generated documents themselves.
distclean: clean distclean-subdirs
	rm -f biopdb_faq.pdf
	rm -f Tutorial.pdf
	rm -f Tutorial.html
	rm -f Tutorial.txt
	rm -f *_motif.gif #output from hacha

# Build each sub-directory with its own Makefile.
$(subdirs):
	$(MAKE) -C $@

# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line. SPHINXOPTS and
# SPHINXBUILD are assigned with "?=" so that values already present in the
# environment are honoured as well.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SPHINXPROJ    = Bio
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

# "Makefile" is declared phony so that the catch-all pattern rule below is
# never applied to this file itself.
.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
40 changes: 1 addition & 39 deletions Doc/Tutorial.tex
Original file line number Diff line number Diff line change
Expand Up @@ -45,57 +45,18 @@
\documentclass{report}
\usepackage{url}
\usepackage{fullpage}
\usepackage{hevea}
\usepackage{graphicx}

% For syntax coloring of python, pycon, bash etc in pdflatex:
\usepackage{minted}
% Minted fails on hevea, https://github.com/gpoore/minted/issues/234
% silently fall back on verbatim - ignore the language argument:
%HEVEA \newenvironment{minted}[1]{\verbatim}{\endverbatim}

% make everything have section numbers
\setcounter{secnumdepth}{4}

% Make links between references
\usepackage{hyperref}
\newif\ifpdf
\ifx\pdfoutput\undefined
\pdffalse
\else
\pdfoutput=1
\pdftrue
\fi
\ifpdf
\hypersetup{colorlinks=true, hyperindex=true, citecolor=red, urlcolor=blue}
\fi

\begin{document}

\begin{htmlonly}
\title{Biopython Tutorial and Cookbook}
\end{htmlonly}
\begin{latexonly}
\title{
%Hack to get the logo on the PDF front page:
\includegraphics[width=\textwidth]{images/biopython_logo.pdf}\\
%Hack to get some white space using a blank line:
~\\
Biopython Tutorial and Cookbook}
\end{latexonly}

\author{Jeff Chang, Brad Chapman, Iddo Friedberg, Thomas Hamelryck, \\
Michiel de Hoon, Peter Cock, Tiago Antao, Eric Talevich, Bartek Wilczy\'{n}ski}
\date{Last Update -- 22 July 2019 (Biopython 1.75.dev0)}

%Hack to get the logo at the start of the HTML front page:
%(hopefully this isn't going to be too wide for most people)
\begin{rawhtml}
<P ALIGN="center">
<IMG ALIGN="center" SRC="images/biopython_logo.svg" TITLE="Biopython Logo" ALT="[Biopython Logo]" width="450" height="300" />
</p>
\end{rawhtml}

\maketitle
\tableofcontents

Expand All @@ -118,6 +79,7 @@
\include{Tutorial/chapter_graphics}
\include{Tutorial/chapter_kegg}
\include{Tutorial/chapter_phenotype}
%\include{Tutorial/chapter_codonalign}
\include{Tutorial/chapter_cookbook}
\include{Tutorial/chapter_testing}
\include{Tutorial/chapter_advanced}
Expand Down
File renamed without changes.
8 changes: 0 additions & 8 deletions Doc/Tutorial/chapter_introduction.tex
Original file line number Diff line number Diff line change
Expand Up @@ -118,21 +118,13 @@ \section{Frequently Asked Questions (FAQ)}
licensed under your choice of the \emph{Biopython License Agreement} or
the \emph{BSD 3-Clause License}.

\begin{latexonly}
\includegraphics[width=6cm]{images/biopython_logo.pdf}\\
\end{latexonly}
\begin{rawhtml}
<IMG ALIGN="center" SRC="images/biopython_logo.svg" TITLE="Biopython Logo (2017 onwards)" ALT="[New Biopython Logo]" width="300" height="200" />
<IMG ALIGN="center" SRC="images/biopython_logo_old.jpg" TITLE="Old Biopython Logo (2003-2017)" ALT="[Old Biopython Logo]" width="512" height="144" />
\end{rawhtml}

Prior to this, the Biopython logo was two yellow snakes forming a double
helix around the word ``BIOPYTHON'', designed by Henrik Vestergaard and
Thomas Hamelryck in 2003 as part of an open competition.

\begin{latexonly}
\includegraphics[width=7cm]{images/biopython_logo_old.jpg}\\
\end{latexonly}

\item \emph{Do you have a change-log listing what's new in each release?} \\
See the file \verb|NEWS.rst| included with the source code (originally called
Expand Down
14 changes: 8 additions & 6 deletions Doc/Tutorial/chapter_learning.tex
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ \section{The Logistic Regression Model}
\label{sec:LogisticRegression}

\subsection{Background and Purpose}
\label{sec:LogisticRegressionBackground}

Logistic regression is a supervised learning approach that attempts to distinguish $K$ classes from each other using a weighted sum of some predictor variables $x_i$. The logistic regression model is used to calculate the weights $\beta_i$ of the predictor variables. In Biopython, the logistic regression model is currently implemented for two classes only ($K = 2$); the number of predictor variables has no predefined limit.

Expand Down Expand Up @@ -34,10 +35,10 @@ \subsection{Background and Purpose}

In the logistic regression model, the probability of belonging to a class depends on the score via the logistic function. For the two classes OP and NOP, we can write this as
\begin{eqnarray}
\Pr(\mathrm{OP}|x_1, x_2) & = & \frac{\exp(\beta_0 + \beta_1 x_1 + \beta_2 x_2)}{1+\exp(\beta_0 + \beta_1 x_1 + \beta_2 x_2)} \label{eq:OP} \\
\Pr(\mathrm{NOP}|x_1, x_2) & = & \frac{1}{1+\exp(\beta_0 + \beta_1 x_1 + \beta_2 x_2)} \label{eq:NOP}
\Pr(\mathrm{OP}|x_1, x_2) & = & \frac{\exp(\beta_0 + \beta_1 x_1 + \beta_2 x_2)}{1+\exp(\beta_0 + \beta_1 x_1 + \beta_2 x_2)} \\
\Pr(\mathrm{NOP}|x_1, x_2) & = & \frac{1}{1+\exp(\beta_0 + \beta_1 x_1 + \beta_2 x_2)}
\end{eqnarray}
Using a set of gene pairs for which it is known whether they belong to the same operon (class OP) or to different operons (class NOP), we can calculate the weights $\beta_0$, $\beta_1$, $\beta_2$ by maximizing the log-likelihood corresponding to the probability functions (\ref{eq:OP}) and (\ref{eq:NOP}).
Using a set of gene pairs for which it is known whether they belong to the same operon (class OP) or to different operons (class NOP), we can calculate the weights $\beta_0$, $\beta_1$, $\beta_2$ by maximizing the log-likelihood corresponding to these probability functions.

\subsection{Training the logistic regression model}
\label{sec:LogisticRegressionTraining}
Expand Down Expand Up @@ -208,7 +209,8 @@ \subsection{Using the logistic regression model for classification}
\end{minted}
(which, by the way, agrees with the biological literature).

To find out how confident we can be in these predictions, we can call the \verb+calculate+ function to obtain the probabilities (equations (\ref{eq:OP}) and \ref{eq:NOP}) for class OP and NOP. For \textit{yxcE}, \textit{yxcD} we find
To find out how confident we can be in these predictions, we can call the \verb+calculate+ function to obtain the probabilities for class OP and NOP (using the equations introduced in Section~\ref{sec:LogisticRegressionBackground}).
For \textit{yxcE}, \textit{yxcD} we find
\begin{minted}{pycon}
>>> q, p = LogisticRegression.calculate(model, [6, -173.143442352])
>>> print("class OP: probability =", p, "class NOP: probability =", q)
Expand Down Expand Up @@ -270,9 +272,9 @@ \subsection{Using the logistic regression model for classification}

\subsection{Logistic Regression, Linear Discriminant Analysis, and Support Vector Machines}

The logistic regression model is similar to linear discriminant analysis. In linear discriminant analysis, the class probabilities also follow equations (\ref{eq:OP}) and (\ref{eq:NOP}). However, instead of estimating the coefficients $\beta$ directly, we first fit a normal distribution to the predictor variables $x$. The coefficients $\beta$ are then calculated from the means and covariances of the normal distribution. If the distribution of $x$ is indeed normal, then we expect linear discriminant analysis to perform better than the logistic regression model. The logistic regression model, on the other hand, is more robust to deviations from normality.
The logistic regression model is similar to linear discriminant analysis. In linear discriminant analysis, the class probabilities also follow the OP and NOP equations introduced in Section~\ref{sec:LogisticRegressionBackground}. However, instead of estimating the coefficients $\beta$ directly, we first fit a normal distribution to the predictor variables $x$. The coefficients $\beta$ are then calculated from the means and covariances of the normal distribution. If the distribution of $x$ is indeed normal, then we expect linear discriminant analysis to perform better than the logistic regression model. The logistic regression model, on the other hand, is more robust to deviations from normality.

Another similar approach is a support vector machine with a linear kernel. Such an SVM also uses a linear combination of the predictors, but estimates the coefficients $\beta$ from the predictor variables $x$ near the boundary region between the classes. If the logistic regression model (equations (\ref{eq:OP}) and (\ref{eq:NOP})) is a good description for $x$ away from the boundary region, we expect the logistic regression model to perform better than an SVM with a linear kernel, as it relies on more data. If not, an SVM with a linear kernel may perform better.
Another similar approach is a support vector machine with a linear kernel. Such an SVM also uses a linear combination of the predictors, but estimates the coefficients $\beta$ from the predictor variables $x$ near the boundary region between the classes. If the logistic regression model (OP and NOP equations in Section~\ref{sec:LogisticRegressionBackground}) is a good description for $x$ away from the boundary region, we expect the logistic regression model to perform better than an SVM with a linear kernel, as it relies on more data. If not, an SVM with a linear kernel may perform better.

Trevor Hastie, Robert Tibshirani, and Jerome Friedman: \textit{The Elements of Statistical Learning. Data Mining, Inference, and Prediction}. Springer Series in Statistics, 2001. Chapter 4.4.

Expand Down
20 changes: 0 additions & 20 deletions Doc/api/Makefile

This file was deleted.

Loading