C语言Logistic回归SGD实现在观测数超约43500时失效的排查求助
C语言Logistic回归SGD实现在观测数超约43500时失效的排查求助
我最近在写C语言版的逻辑回归(用随机梯度下降SGD训练),遇到了一个头疼的问题:在小的半随机数据集上运行完全正常,但当观测数超过大概43500时,模型就彻底失效了——比如设置m=50000时,最终输出的Percent class 1: 0.000000,但把m降到43500以下,就能正常返回Percent class 1: 0.250000,不管迭代次数max_iter设多少都是这个情况。
我生成的150个特征里,前两个是和观测数挂钩的:
- 第一个特征:如果是类别0(前75%的观测),值为
1.0 / m;如果是类别1,值为(double)i / m(i是当前观测的索引) - 第二个特征:所有观测都是
(double)i / m - 剩下的特征都是随机生成的
我用的是双精度类型,不确定是不是特征生成的问题,或者代码里某个地方出现了溢出?
下面是完整可运行的C代码,它会生成m=50000个观测、n=150个特征。另外我用Python+NumPy实现了逻辑完全等价的代码,就算观测数超过50000也能得到正确结果,放在后面供参考。
C语言实现代码
#include <stdlib.h> #include <stdio.h> #include <math.h> #include <time.h> // Compute z = w * x + b double dlc( int n, double *X, double *coef, double intercept ) { double y_pred = intercept; for (int i = 0; i < n; i++) { y_pred += X[i] * coef[i]; } return y_pred; } // Compute y_hat = 1 / (1 + e^(-z)) double sigmoid( int n, double alpha, double *X, double *coef, double beta, double intercept ) { double y_pred; y_pred = dlc(n, X, coef, intercept); y_pred = 1.0 / (1.0 + exp(-y_pred)); return y_pred; } // Stochastic gradient descent void sgd( int m, int n, double *X, double *y, double *coef, double *intercept, double eta, int max_iter, int fit_intercept, int random_seed ) { double *gradient_coef, *X_i; double y_i, y_pred, resid; int idx; double gradient_intercept = 0.0, alpha = 1.0, beta = 1.0; X_i = (double *) malloc (n * sizeof(double)); gradient_coef = (double *) malloc (n * sizeof(double)); for ( int i = 0; i < n; i++ ) { coef[i] = 0.0; gradient_coef[i] = 0.0; } *intercept = 0.0; srand(random_seed); for ( int epoch = 0; epoch < max_iter; epoch++ ) { for ( int run = 0; run < m; run++ ) { // Randomly sample an observation idx = rand() % m; for ( int i = 0; i < n; i++ ) { X_i[i] = X[n*idx+i]; } y_i = y[idx]; // Compute y_hat y_pred = sigmoid( n, alpha, X_i, coef, beta, *intercept ); resid = -(y_i - y_pred); // Compute gradients and adjust weights for (int i = 0; i < n; i++) { gradient_coef[i] = X_i[i] * resid; coef[i] -= eta * gradient_coef[i]; } if ( fit_intercept == 1 ) { *intercept -= eta * resid; } } } } int main(void) { double *X, *y, *coef, *y_pred; double intercept; double eta = 0.05; double alpha = 1.0, beta = 1.0; long m = 50000; long n = 150; int max_iter = 20; long class_0 = (long)(3.0 / 4.0 * (double)m); double pct_class_1 = 0.0; clock_t test_start; clock_t test_end; double test_time; printf("Constructing variables...\n"); X = (double *) malloc (m * n * sizeof(double)); y = (double *) malloc (m * sizeof(double)); y_pred = (double *) malloc (m * sizeof(double)); coef = (double *) malloc (n * sizeof(double)); // Initialize classes for (int i = 0; i < m; i++) { if (i < class_0) { y[i] = 0.0; } else { y[i] = 1.0; } } // Initialize observation features for (int i = 0; i < m; i++) { if (i < class_0) { X[n*i] = 1.0 / (double)m; } else { X[n*i] = (double)i / (double)m; } X[n*i + 1] = (double)i / (double)m; for (int j = 2; j < n; j++) { X[n*i + j] = (double)(rand() % 100) / 100.0; } } // Fit weights printf("Running SGD...\n"); test_start = clock(); sgd( m, n, X, y, coef, &intercept, eta, max_iter, 1, 42 ); test_end = clock(); test_time = (double)(test_end - test_start) / CLOCKS_PER_SEC; printf("Time taken: %f\n", test_time); // Compute y_hat and share of observations predicted as class 1 printf("Making predictions...\n"); for ( int i = 0; i < m; i++ ) { y_pred[i] = sigmoid( n, alpha, &X[i*n], coef, beta, intercept ); } printf("Printing results...\n"); for ( int i = 0; i < m; i++ ) { //printf("%f\n", y_pred[i]); if (y_pred[i] > 0.5) { pct_class_1 += 1.0; } // Troubleshooting print if (i < 10 || i > m - 10) { printf("%g\n", y_pred[i]); } } printf("Percent class 1: %f", pct_class_1 / (double)m); return 0; }
参考Python实现代码
import numpy as np import time def sigmoid(x): return 1 / (1 + np.exp(-x)) class LogisticRegressor: def __init__(self, eta, init_runs, fit_intercept=True): self.eta = eta self.init_runs = init_runs self.fit_intercept = fit_intercept def fit(self, x, y): m, n = x.shape self.coef = np.zeros((n, 1)) self.intercept = np.zeros((1, 1)) for epoch in range(self.init_runs): for run in range(m): idx = np.random.randint(0, m) x_i = x[idx:idx+1, :] y_i = y[idx] y_pred_i = sigmoid(x_i.dot(self.coef) + self.intercept) gradient_w = -(x_i.T * (y_i - y_pred_i)) self.coef -= self.eta * gradient_w if self.fit_intercept: gradient_b = -(y_i - y_pred_i) self.intercept -= self.eta * gradient_b def predict_proba(self, x): m, n = x.shape y_pred = np.ones((m, 2)) y_pred[:,1:2] = sigmoid(x.dot(self.coef) + self.intercept) y_pred[:,0:1] -= y_pred[:,1:2] return y_pred def predict(self, x): return np.round(sigmoid(x.dot(self.coef) + self.intercept)) m = 50000 n = 150 class1 = int(3.0 / 4.0 * m) X = np.random.rand(m, n) y = np.zeros((m, 1)) for obs in range(m): if obs < class1: continue else: y[obs,0] = 1 for obs in range(m): if obs < class1: X[obs, 0] = 1.0 / float(m) else: X[obs, 0] = float(obs) / float(m) X[obs, 1] = float(obs) / float(m) logit = LogisticRegressor(0.05, 20) start_time = time.time() logit.fit(X, y) end_time = time.time() print(round(end_time - start_time, 2)) y_pred = logit.predict(X) print("Percent:", y_pred.sum() / len(y_pred))
备注:内容来源于stack exchange,提问作者hillard28




