Newton–Raphson

In numerical analysis, Newton’s method (also known as the Newton–Raphson method), named after Isaac Newton and Joseph Raphson, is a method for finding successively better approximations to the roots (or zeroes) of a real-valued function. It is one example of a root-finding algorithm.

The Newton–Raphson method in one variable is implemented as follows:

The method starts with a function f defined over the real numbers x, the function’s derivative f ′, and an initial guess x0 for a root of the function f. If the function satisfies the assumptions made in the derivation of the formula and the initial guess is close, then a better approximation x1 is

\[ x_1 = x_0 - \frac{f(x_0)}{f'(x_0)}. \]
Geometrically, (x1, 0) is the intersection of the x-axis and the tangent of the graph of f at (x0, f(x0)).

The process is repeated as

\[ x_{n+1} = x_n - \frac{f(x_n)}{f'(x_n)} \]
until a sufficiently accurate value is reached.
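
As a minimal sketch of this iteration (the function, derivative, and starting value below are illustrative choices, not part of the gamma examples that follow), a generic univariate Newton–Raphson routine in R might look like:

# generic univariate Newton-Raphson: iterate x <- x - f(x) / f'(x)
newton <- function (f, df, x0, tolerance = 1e-10, max.iterations = 25) {
  x <- x0
  for (it in 1:max.iterations) {
    x.new <- x - f(x) / df(x)
    if (abs(x.new - x) < tolerance) break # step size below tolerance
    x <- x.new
  }
  x.new
}

# example: root of f(x) = x^2 - 2, i.e. sqrt(2), starting from x0 = 1
newton(function (x) x ^ 2 - 2, function (x) 2 * x, 1) # ~ 1.414214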

Calculating alpha and beta for a gamma distribution using the Newton–Raphson algorithm on alpha

Below I have implemented the Newton–Raphson method to estimate the parameters of a gamma distribution. Here the role of f is played by the score, i.e. the derivative of the log-likelihood, and the variable being updated is the shape parameter alpha. The gamma distribution has two parameters (alpha, beta); to keep the iteration one-dimensional, beta is recomputed after every iteration from the method-of-moments relation beta = alpha / mean(x).
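
For reference, these are the quantities coded below. With shape alpha and rate beta, the gamma log-likelihood of a sample x1, ..., xn is

\[ \ell(\alpha, \beta) = n\left( \alpha \log \beta - \log \Gamma(\alpha) + (\alpha - 1)\,\overline{\log x} - \beta \bar{x} \right), \]

and substituting beta = alpha / xbar and differentiating gives the score and negated curvature used as Dl and D2l in the code (psi and psi1 are R's digamma and trigamma):

\[ \frac{\partial \ell}{\partial \alpha} = n\left( \log \alpha - \psi(\alpha) - \left( \log \bar{x} - \overline{\log x} \right) \right), \qquad -\frac{\partial^2 \ell}{\partial \alpha^2} = n\left( \psi_1(\alpha) - \frac{1}{\alpha} \right). \]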

gamma.alpha.mle <- function (x, tolerance = 1e-10, max.iterations = 25) {
  # sufficient stats:
  xbar <- mean(x); logxbar <- mean(log(x)); n <- length(x)
  rhs <- log(xbar) - logxbar
  # init
  alpha <- xbar ^ 2 / var(x);
  beta <- alpha / xbar # method of moments
  lhood <- n * (alpha * log(beta) - lgamma(alpha) +
                  (alpha - 1) * logxbar - beta * xbar)
  
  # iterate
  for (it in 1:max.iterations) {
    Dl <- n * (log(alpha) - digamma(alpha) - rhs) # dl / dalpha
    D2l <- -n * (1 / alpha - trigamma(alpha)) # -d^2 l / dalpha^2
    
    alpha.new <- alpha + Dl / D2l
    beta <- alpha.new / xbar
    
    lhood.new <- n * (alpha.new * log(beta) - lgamma(alpha.new) +
                        (alpha.new - 1) * logxbar - beta * xbar)
    message("[", it, "] alpha = ", alpha.new, " | lhood = ", lhood.new)
    
    if (abs((lhood.new - lhood) / lhood) < tolerance) break # required precision reached
    alpha <- alpha.new; lhood <- lhood.new
  }
  list(alpha = alpha, beta = beta, vcov = 1 / D2l)
}

# Example

n <- 100
x <- rgamma(n, 2.5, 1) 

res1 <- gamma.alpha.mle(x)
## [1] alpha = 2.91134537676027 | lhood = -172.525795855384
## [2] alpha = 2.91142027517776 | lhood = -172.525795836985
## [3] alpha = 2.91142027719767 | lhood = -172.525795836984
res1
## $alpha
## [1] 2.91142
## 
## $beta
## [1] 1.108889
## 
## $vcov
## [1] 0.1524543
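
As an independent check (assuming the MASS package is available; exact numbers depend on the simulated sample), MASS::fitdistr fits both gamma parameters by maximum likelihood. Its shape estimate and standard error should essentially match res1$alpha and sqrt(res1$vcov), since beta = alpha / mean(x) is also the conditional MLE of beta, so the one-dimensional curvature above reproduces the marginal variance of alpha.

MASS::fitdistr(x, "gamma")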

Calculating alpha and beta using bivariate Newton–Raphson on both alpha and beta

Below I have implemented the version in which every iteration updates both alpha and beta, so the full 2 x 2 Hessian of the log-likelihood is needed. Because the negative Hessian is symmetric and positive definite near the maximum, the Newton step is obtained from its Cholesky decomposition, and the same factor yields the variance estimate chol2inv(C) at the end.
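
Written out, the score vector and negative Hessian used below (Dl and D2l in the code; psi and psi1 again denote digamma and trigamma) are

\[ \nabla \ell = n \begin{pmatrix} \log \beta - \psi(\alpha) + \overline{\log x} \\ \alpha / \beta - \bar{x} \end{pmatrix}, \qquad -\nabla^2 \ell = n \begin{pmatrix} \psi_1(\alpha) & -1/\beta \\ -1/\beta & \alpha / \beta^2 \end{pmatrix}, \]

and each Newton update adds the step obtained by solving the linear system with this negative Hessian on the left and the score on the right, which chol.solve does with two triangular solves.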

# given C = chol(A), returns A^{-1} x via two triangular solves
chol.solve <- function (C, x) backsolve(C, backsolve(C, x, transpose = TRUE))

gamma.mle <- function (x, tolerance = 1e-10, max.iterations = 25) {
  # sufficient stats:
  xbar <- mean(x); logxbar <- mean(log(x)); n <- length(x)
  # init
  theta <- numeric(2)
  theta[1] <- xbar ^ 2 / var(x) # alpha
  theta[2] <- theta[1] / xbar # beta
  lhood <- n * (theta[1] * log(theta[2]) - lgamma(theta[1]) +
                  (theta[1] - 1) * logxbar - theta[2] * xbar)
  Dl <- numeric(2); D2l <- matrix(nrow = 2, ncol = 2) # pre-allocate
  # iterate
  for (it in 1:max.iterations) {
    Dl[1] <- n * (log(theta[2]) - digamma(theta[1]) + logxbar) # dl / dalpha
    Dl[2] <- n * (theta[1] / theta[2] - xbar) # dl / dbeta
    
    D2l[1, 1] <- n * trigamma(theta[1]) # -d^2 l / dalpha^2
    D2l[1, 2] <- D2l[2, 1] <- -n / theta[2] # -d^2 l / (dalpha dbeta)
    D2l[2, 2] <- n * theta[1] / theta[2] ^ 2 # -d^2 l / dbeta^2
    C <- chol(D2l)
    
    theta.new <- theta + chol.solve(C, Dl)
    lhood.new <- n * (theta.new[1] * log(theta.new[2]) - lgamma(theta.new[1]) +
                        (theta.new[1] - 1) * logxbar - theta.new[2] * xbar)
    message("[", it, "] alpha = ", theta.new[1], " | lhood = ", lhood.new)
    if (abs((lhood.new - lhood) / lhood) < tolerance) break
    theta <- theta.new; lhood <- lhood.new
  }
  list(alpha = theta[1], beta = theta[2], vcov = chol2inv(C))
}


# Example

n <- 500
x <- rgamma(n, 5, 3) 

res2 <- gamma.mle(x)
## [1] alpha = 5.1427205099826 | lhood = -519.8682177595
## [2] alpha = 5.15385757836242 | lhood = -519.867592888983
## [3] alpha = 5.15388247066543 | lhood = -519.86759288588
res2
## $alpha
## [1] 5.153858
## 
## $beta
## [1] 3.099349
## 
## $vcov
##            [,1]       [,2]
## [1,] 0.09983913 0.06003974
## [2,] 0.06003974 0.03983346
# Mean of the distribution 
mu <- res2$alpha / res2$beta


# Comparison with the glm function in R
fit <- glm(x ~ 1, family = Gamma)
MASS::gamma.shape(fit)
##                 
## Alpha: 5.1538825
## SE:    0.3159737
c(res2$alpha, sqrt(res1$vcov), sqrt(res2$vcov[1,1]))
## [1] 5.1538576 0.3904540 0.3159733
# res2$alpha and its SE match MASS::gamma.shape; sqrt(res1$vcov) comes from the
# earlier n = 100 sample, so it is not directly comparable

Using the Newton–Raphson method to calculate the betas in logistic regression
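
With y_i ~ Bernoulli(mu_i) and mu_i = inv.logit(x_i' beta), the Bernoulli log-likelihood gives the familiar ingredients used for Dl and D2l in the code below:

\[ \nabla \ell = X^{\top} (y - \mu), \qquad -\nabla^2 \ell = X^{\top} W X, \quad W = \operatorname{diag}\{ \mu_i (1 - \mu_i) \}, \]

so each Newton step is the same weighted least-squares update that glm's IRLS performs, which is why the two fits below agree.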

# [ Logistic regression: y_i ~ Bern(inv.logit(x_i' * beta)) ]
logit <- function (x) log(x / (1 - x)) 
inv.logit <- function (x) 1 / (1 + exp(-x))

fit.logistic <- function (y, X, tolerance = 1e-10, max.iterations = 10) {
  beta <- coef(lm(logit((y + .5) / 2) ~ X - 1)) # crude start: least squares on a shrunken logit of y
  mu <- inv.logit(drop(X %*% beta))
  lhood <- sum(log(y * mu + (1 - y) * (1 - mu)))
  for (it in 1:max.iterations) {
    Dl <- crossprod(X, y - mu)
    D2l <- crossprod(X, mu * (1 - mu) * X) # negative Hessian
    C <- chol(D2l)
    beta.new <- beta + chol.solve(C, Dl)
    mu <- inv.logit(drop(X %*% beta.new))
    lhood.new <- sum(log(y * mu + (1 - y) * (1 - mu)))
    message("[", it, "] lhood = ", lhood.new)
    if (abs((lhood.new - lhood) / lhood.new) < tolerance) break
    beta <- beta.new; lhood <- lhood.new
  }
  list(beta = beta, vcov = chol2inv(C))
}

# Example
y <- rbinom(n, 1, inv.logit(1 - 2 * x)) # reuses n and x from the gamma example above
X <- model.matrix(~ x)
(res <- fit.logistic(y, X))
## [1] lhood = -187.113480327674
## [2] lhood = -179.192228587408
## [3] lhood = -178.351909850074
## [4] lhood = -178.338506960706
## [5] lhood = -178.33850281942
## [6] lhood = -178.33850281942
## $beta
##            [,1]
## [1,]  0.6406201
## [2,] -1.7206999
## 
## $vcov
##             [,1]        [,2]
## [1,]  0.12838010 -0.09035378
## [2,] -0.09035378  0.07414376
beta0 <- coef(lm(logit((y + .5) / 2) ~ X - 1))
(fit <- glm(y ~ x, family = binomial, start = beta0,
           control = list(epsilon = 1e-10, trace = T)))
## Deviance = 374.227 Iterations - 1
## Deviance = 358.3845 Iterations - 2
## Deviance = 356.7038 Iterations - 3
## Deviance = 356.677 Iterations - 4
## Deviance = 356.677 Iterations - 5
## Deviance = 356.677 Iterations - 6
## 
## Call:  glm(formula = y ~ x, family = binomial, start = beta0, control = list(epsilon = 1e-10, 
##     trace = T))
## 
## Coefficients:
## (Intercept)            x  
##      0.6406      -1.7207  
## 
## Degrees of Freedom: 499 Total (i.e. Null);  498 Residual
## Null Deviance:       412.2 
## Residual Deviance: 356.7     AIC: 360.7
summary(fit)$cov.scaled # compare to res$vcov
##             (Intercept)           x
## (Intercept)  0.12838010 -0.09035378
## x           -0.09035378  0.07414376