Linear models
#############

The linear models module contains several popular instances of the generalized
linear model (GLM).

.. raw:: html

    <h2>Ordinary and Weighted Linear Least Squares</h2>

In weighted linear least-squares regression (WLS), a real-valued target
:math:`y_i` is modeled as a linear combination of covariates
:math:`\mathbf{x}_i` and model coefficients **b**:

.. math::

    y_i = \mathbf{b}^\top \mathbf{x}_i + \epsilon_i

In the above equation, :math:`\epsilon_i \sim \mathcal{N}(0, \sigma_i^2)` is a
normally distributed error term with variance :math:`\sigma_i^2`. Ordinary
least squares (OLS) is a special case of this model where the variance is fixed
across all examples, i.e., :math:`\sigma_i = \sigma_j \ \forall i,j`. The
maximum likelihood model parameters, :math:`\hat{\mathbf{b}}_{WLS}`, are those
that minimize the weighted squared error between the model predictions and the
true values:

.. math::

    \mathcal{L} = ||\mathbf{W}^{0.5}(\mathbf{y} - \mathbf{bX})||_2^2

where :math:`\mathbf{W}` is a diagonal matrix of the example weights. In OLS,
:math:`\mathbf{W}` is the identity matrix. The maximum likelihood estimate for
the model parameters can be computed in closed form using the normal equations:

.. math::

    \hat{\mathbf{b}}_{WLS} =
        (\mathbf{X}^\top \mathbf{WX})^{-1} \mathbf{X}^\top \mathbf{Wy}

where, in the OLS case, :math:`(\mathbf{X}^\top \mathbf{X})^{-1} \mathbf{X}^\top`
is known as the pseudoinverse / Moore-Penrose inverse of :math:`\mathbf{X}`.
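
As a quick illustration of these equations, the snippet below evaluates the
weighted normal equations directly with NumPy. It is a minimal sketch rather
than the module's own linear regression implementation; the ``fit_wls`` helper,
the synthetic data, and the choice of example weights are all illustrative
assumptions.

.. code-block:: python

    import numpy as np

    def fit_wls(X, y, weights=None):
        """Closed-form WLS fit; reduces to OLS when ``weights`` is None."""
        N = X.shape[0]
        W = np.eye(N) if weights is None else np.diag(weights)
        # b_hat = (X^T W X)^{-1} X^T W y  (the weighted normal equations)
        return np.linalg.pinv(X.T @ W @ X) @ X.T @ W @ y

    # Illustrative synthetic data: y = 2 * x1 - 3 * x2 + 1 + Gaussian noise
    rng = np.random.RandomState(0)
    X = np.hstack([rng.randn(100, 2), np.ones((100, 1))])  # bias column
    y = X @ np.array([2.0, -3.0, 1.0]) + 0.1 * rng.randn(100)

    b_ols = fit_wls(X, y)                         # OLS: W is the identity
    b_wls = fit_wls(X, y, weights=rng.rand(100))  # WLS: per-example weights
    print(np.round(b_ols, 2), np.round(b_wls, 2))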

**Models**

- :class:`~numpy_ml.linear_models.LinearRegression`

.. raw:: html

    <h2>Ridge Regression</h2>

Ridge regression uses the same simple linear regression model but adds an
additional penalty on the `L2`-norm of the coefficients to the loss function.
This is sometimes known as Tikhonov regularization.

In particular, the ridge model is the same as the OLS model:

.. math::

    \mathbf{y} = \mathbf{bX} + \mathbf{\epsilon}

where :math:`\epsilon \sim \mathcal{N}(0, \sigma^2 I)`, except now the error
for the model is calculated as

.. math::

    \mathcal{L} = ||\mathbf{y} - \mathbf{bX}||_2^2 + \alpha ||\mathbf{b}||_2^2

The MLE for the model parameters **b** can be computed in closed form via
the adjusted normal equation:

.. math::

    \hat{\mathbf{b}}_{Ridge} =
        (\mathbf{X}^\top \mathbf{X} + \alpha I)^{-1} \mathbf{X}^\top \mathbf{y}

where :math:`(\mathbf{X}^\top \mathbf{X} + \alpha I)^{-1} \mathbf{X}^\top` is
the pseudoinverse / Moore-Penrose inverse adjusted for the `L2` penalty.
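
The adjusted normal equation above can likewise be evaluated directly; the
sketch below is an illustrative NumPy implementation, not the module's own
ridge model, and the ``fit_ridge`` helper, the data, and the values of
``alpha`` are assumptions. Note how the `L2` norm of the coefficients shrinks
as :math:`\alpha` grows.

.. code-block:: python

    import numpy as np

    def fit_ridge(X, y, alpha=1.0):
        """Closed-form ridge fit: b = (X^T X + alpha * I)^{-1} X^T y."""
        D = X.shape[1]
        return np.linalg.solve(X.T @ X + alpha * np.eye(D), X.T @ y)

    # Illustrative data; larger alpha values shrink the coefficient norm
    rng = np.random.RandomState(1)
    X = rng.randn(50, 5)
    y = X @ rng.randn(5) + 0.5 * rng.randn(50)

    for alpha in (0.0, 1.0, 10.0):
        print(alpha, np.round(np.linalg.norm(fit_ridge(X, y, alpha)), 3))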

We can also compute a closed-form solution for the posterior predictive
distribution of the Bayesian linear regression model.

**Models**

- :class:`~numpy_ml.linear_models.BayesianLinearRegressionUnknownVariance`

.. raw:: html

    <h2>Naive Bayes Classifier</h2>

The naive Bayes model assumes the features of a training example
:math:`\mathbf{x}` are mutually independent given the example label :math:`y`:

.. math::

    P(\mathbf{x}_i \mid y_i) = \prod_{j=1}^M P(x_{i,j} \mid y_i)

where :math:`M` is the rank (i.e., the number of features) of the
:math:`i^{th}` example :math:`\mathbf{x}_i` and :math:`y_i` is the label
associated with the :math:`i^{th}` example.

Combining this conditional independence assumption with a simple application of
Bayes' theorem gives the naive Bayes classification rule:

.. math::

    \hat{y} &= \arg \max_y P(y \mid \mathbf{x}) \\
            &= \arg \max_y P(y) P(\mathbf{x} \mid y) \\
            &= \arg \max_y P(y) \prod_{j=1}^M P(x_j \mid y)

The prior class probability :math:`P(y)` can be specified in advance or
estimated empirically from the training data.
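
To make the decision rule concrete, the sketch below implements it with
per-feature Gaussian class conditionals, in the spirit of
:class:`~numpy_ml.linear_models.GaussianNBClassifier` (though it is not that
class's implementation). The ``eps`` variance-smoothing term and the synthetic
two-class data are illustrative assumptions.

.. code-block:: python

    import numpy as np

    def fit_gaussian_nb(X, y, eps=1e-6):
        """Estimate class priors and per-feature Gaussian means/variances."""
        classes = np.unique(y)
        priors = np.array([np.mean(y == c) for c in classes])
        means = np.array([X[y == c].mean(axis=0) for c in classes])
        variances = np.array([X[y == c].var(axis=0) + eps for c in classes])
        return classes, priors, means, variances

    def predict_gaussian_nb(X, classes, priors, means, variances):
        """Return argmax_y [ log P(y) + sum_j log P(x_j | y) ] for each row."""
        log_joint = []
        for prior, mu, var in zip(priors, means, variances):
            # Sum of per-feature Gaussian log densities (independence assumption)
            ll = -0.5 * np.sum(np.log(2 * np.pi * var) + (X - mu) ** 2 / var, axis=1)
            log_joint.append(np.log(prior) + ll)
        return classes[np.argmax(log_joint, axis=0)]

    # Two well-separated Gaussian blobs as illustrative training data
    rng = np.random.RandomState(2)
    X = np.vstack([rng.randn(50, 3) - 2, rng.randn(50, 3) + 2])
    y = np.array([0] * 50 + [1] * 50)

    params = fit_gaussian_nb(X, y)
    print(np.mean(predict_gaussian_nb(X, *params) == y))  # training accuracy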

**Models**

- :class:`~numpy_ml.linear_models.GaussianNBClassifier`

.. raw:: html

    <h2>Generalized Linear Model</h2>

The generalized linear model (GLM) assumes that each target/dependent variable
:math:`y_i` in the target vector :math:`\mathbf{y} = (y_1, \ldots, y_n)` has
been drawn independently from a pre-specified distribution in the exponential
family with unknown mean :math:`\mu_i`. The GLM models a (one-to-one,
continuous, differentiable) function, *g*, of this mean value as a linear
combination of the model parameters :math:`\mathbf{b}` and observed covariates
:math:`\mathbf{x}_i`:

.. math::

    g(\mathbb{E}[y_i \mid \mathbf{x}_i]) =
        g(\mu_i) = \mathbf{b}^\top \mathbf{x}_i

where *g* is known as the link function. The choice of link function is
informed by the instance of the exponential family the target is drawn from.

**Models**

- :class:`~numpy_ml.linear_models.GeneralizedLinearModel`

.. toctree::
    :maxdepth: 2
    :hidden: