TensorFlow全连接神经网络准确率约0.75但Loss无法低于0.5的优化及特征相关问题咨询
TensorFlow全连接神经网络准确率约0.75但Loss无法低于0.5的优化及特征相关问题咨询
我用Python实现了一个全连接神经网络(FCNN),数据集包含约5000场足球比赛,使用的特征包括每场比赛主客队的射门数、角球数、黄牌数、犯规数、创造的绝佳机会数、传球数、传中数和扑救数。
预测目标是二分类:我把比赛结果(主胜/客胜/平局)处理成了二分类标签。模型输出层用了1个神经元+sigmoid激活函数,损失函数选的是binary_crossentropy,还加了Dropout和早停机制。
现在训练集和验证集的准确率都在0.75左右,看起来还不错,但两者的Loss始终降不到0.5以下,这应该不太理想吧?有没有什么优化方法能让Loss降下来?
编辑补充:我后来加了两个新特征——主队和客队的射门转化率(进球数/射门数),现在Loss直接降到了0.1甚至更低!
不过有个问题:用随机森林做特征重要性分析时,这两个新特征的权重远远高于其他所有特征,这让我有点担心是不是哪里不对。

以下是我完整的实现代码:
"""Train a binary fully connected network on per-match football statistics.

Usage:
    python <script> <csv_file> <pattern> <network_path>

``pattern`` selects which event is turned into the binary target (see
``OUTCOME_RULES``). The trained Keras model is written into
``network_path`` as ``<pattern>_Model_<timestamp>.h5``.

Label convention (kept from the original script): ``Outcome`` is 0 when
the selected event happened and 1 otherwise, so the sigmoid output
estimates the probability that the event did NOT happen.
"""

import os
import sys
from datetime import datetime

import numpy as np
import pandas as pd

USAGE = "Usage: <script> <csv_file> <pattern> <network_path>"

# pattern -> vectorised predicate over (home goals, away goals); True means
# the event occurred for that match.
OUTCOME_RULES = {
    "HW": lambda home, away: home > away,                # home win
    "HWD": lambda home, away: home >= away,              # home win or draw
    "AW": lambda home, away: home < away,                # away win
    "OV25": lambda home, away: home + away > 2,          # over 2.5 goals
    "BTTS": lambda home, away: (home > 0) & (away > 0),  # both teams score
    "OV35": lambda home, away: home + away > 3,          # over 3.5 goals
    "DRAW": lambda home, away: home == away,             # draw
    "HOV05": lambda home, away: home > 0,                # home over 0.5
    "AOV05": lambda home, away: away > 0,                # away over 0.5
    "HOV15": lambda home, away: home > 1,                # home over 1.5
    "AOV15": lambda home, away: away > 1,                # away over 1.5
}

# Identifiers, the goal columns the label is derived from, and the raw shot
# breakdowns that are replaced by the aggregated shot totals.
DROPPED_COLUMNS = (
    "HomeTeamID", "AwayTeamID", "MatchId",
    "HomeTeamFTGoal", "AwayTeamFTGoal",
    "HomeTeamName", "AwayTeamName", "MatchDate",
    "HomeTeamxG", "AwayTeamxG",
    "HomeTeamBlockedShots", "AwayTeamBlockedShots",
    "HomeTeamShotsInsideBox", "AwayTeamShotsInsideBox",
    "HomeTeamShotsOutsideBox", "AwayTeamShotsOutsideBox",
    "HomeTeamShotsOnTarget", "AwayTeamShotsOnTarget",
    "HomeTeamShotsOffTarget", "AwayTeamShotsOffTarget",
)


def parse_args(argv):
    """Return ``(file_name, pattern, network_path)`` taken from *argv*.

    Exits with a non-zero status and a message when the argument count is
    wrong or the pattern is unknown. The original script referenced
    ``quit`` without calling it, so it kept running and crashed later
    with a NameError.
    """
    if len(argv) != 4:
        sys.exit("[ERROR] Incorrect parameters\n" + USAGE)
    _, file_name, pattern, network_path = argv
    if pattern not in OUTCOME_RULES:
        sys.exit(
            f"[ERROR] Unknown pattern {pattern!r}; "
            f"expected one of: {', '.join(OUTCOME_RULES)}"
        )
    return file_name, pattern, network_path


def add_shot_totals(df):
    """Return a copy of *df* with total shots per side added.

    Total shots = on target + off target + blocked.
    """
    return df.assign(
        HomeTeamShots=(
            df["HomeTeamShotsOnTarget"]
            + df["HomeTeamShotsOffTarget"]
            + df["HomeTeamBlockedShots"]
        ),
        AwayTeamShots=(
            df["AwayTeamShotsOnTarget"]
            + df["AwayTeamShotsOffTarget"]
            + df["AwayTeamBlockedShots"]
        ),
    )


def label_outcome(df, pattern):
    """Return the binary target for *pattern* as an int64 Series.

    0 means the event described by *pattern* occurred, 1 means it did not.

    Raises:
        ValueError: if *pattern* is not a key of ``OUTCOME_RULES``.
    """
    try:
        rule = OUTCOME_RULES[pattern]
    except KeyError:
        raise ValueError(f"Unknown pattern: {pattern!r}") from None
    occurred = rule(df["HomeTeamFTGoal"], df["AwayTeamFTGoal"])
    return (~occurred).astype("int64")


def prepare_dataset(df, pattern):
    """Clean *df* and split it into a feature frame and a target array.

    Returns:
        ``(features, target)`` where *features* is a DataFrame holding the
        model inputs and *target* is a 1-D array of 0/1 labels.
    """
    # Remove corrupted lines. read_csv parses empty fields as NaN by
    # default, so NaN ids are dropped as well as literal empty strings.
    match_id = df["MatchId"]
    df = df[match_id.notna() & (match_id != "")].copy()

    # Aggregate shots in just 2 columns, for home and away.
    df = add_shot_totals(df)

    # Shot conversion rate [newly added]
    # NOTE: these features are computed from the full-time goals, which are
    # exactly what Outcome is derived from. Together with the shot totals
    # they let a model reconstruct the goals, i.e. they leak the target.
    # df['HomeTeamConversionRate'] = 0  # Default value for conversion rate
    # df.loc[df['HomeTeamShots'] > 0, 'HomeTeamConversionRate'] = df['HomeTeamFTGoal'] / df['HomeTeamShots']
    # df['AwayTeamConversionRate'] = 0  # Default value for conversion rate
    # df.loc[df['AwayTeamShots'] > 0, 'AwayTeamConversionRate'] = df['AwayTeamFTGoal'] / df['AwayTeamShots']

    df["Outcome"] = label_outcome(df, pattern)

    # Drop not relevant columns.
    df = df.drop(columns=list(DROPPED_COLUMNS))

    features = df.drop(columns=["Outcome"])
    target = df["Outcome"].values
    return features, target


def main(argv=None):
    """Run the pipeline: load, label, inspect features, train, plot, save."""
    file_name, pattern, network_path = parse_args(
        sys.argv if argv is None else argv
    )

    # The ML stack is imported only after the arguments are validated so
    # usage errors are reported immediately, and so the pure-pandas helpers
    # above can be imported without scikit-learn / TensorFlow installed.
    # Every name imported by the original script is kept, including the
    # ones it does not use yet.
    import joblib  # noqa: F401
    import matplotlib.pyplot as plt
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import (  # noqa: F401
        LabelEncoder,
        MinMaxScaler,
        StandardScaler,
    )
    from tensorflow.keras.callbacks import (  # noqa: F401
        EarlyStopping,
        LearningRateScheduler,
        ReduceLROnPlateau,
    )
    from tensorflow.keras.layers import BatchNormalization, Dense, Dropout
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.optimizers import Adam
    from tensorflow.keras.regularizers import l2
    from tensorflow.keras.utils import to_categorical  # noqa: F401

    # Step 1: Load the dataset
    df = pd.read_csv(file_name)
    print("Available columns:", df.columns)

    features, y = prepare_dataset(df, pattern)
    feature_names = features.columns
    X = features.values

    # Split training and testing datasets. The target stays a 0/1 vector
    # (no one-hot encoding) because the network ends in a single sigmoid
    # unit trained with binary cross-entropy.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only, then apply it to both.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    ############### Random forest for feature evaluation ###############
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train_scaled, y_train)
    importances_rf = rf_model.feature_importances_

    print("Feature Importances from Random Forest:")
    for feature, importance in zip(feature_names, importances_rf):
        print(f"{feature}: {importance}")

    plt.figure(figsize=(10, 6))
    plt.barh(feature_names, importances_rf)
    plt.xlabel('Feature Importance')
    plt.title('Feature Importance from Random Forest')
    plt.show()

    ############### Models ###############
    # NOTE: Keras adds the l2 kernel penalties to the reported `loss` /
    # `val_loss`, so those values are not the bare cross-entropy.
    model = Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],),
              kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.2),
        Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),  # single-unit binary output
    ])

    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    # Early stopping and learning rate reduction, both driven by validation
    # accuracy.
    early_stopping = EarlyStopping(
        monitor='val_accuracy', patience=20, restore_best_weights=True
    )
    reduce_lr = ReduceLROnPlateau(
        monitor='val_accuracy', factor=0.5, patience=3, min_lr=1e-6, verbose=1
    )

    # NOTE: the held-out test split doubles as the validation set, so the
    # callbacks above tune on it and the reported validation metrics are
    # optimistic.
    history = model.fit(
        X_train_scaled, y_train,
        validation_data=(X_test_scaled, y_test),
        epochs=50,
        batch_size=32,
        callbacks=[early_stopping, reduce_lr],
        verbose=1,
    )

    # Plot training and validation loss
    plt.figure(figsize=(10, 6))
    plt.plot(history.history['loss'], label='Training Loss', color='blue')
    plt.plot(history.history['val_loss'], label='Validation Loss', color='orange')
    plt.title('Training and Validation Loss Over Epochs', fontsize=16)
    plt.xlabel('Epochs', fontsize=14)
    plt.ylabel('Loss', fontsize=14)
    plt.legend(fontsize=12)
    plt.grid(True)
    plt.show()

    predictions = model.predict(X_test_scaled)
    plt.hist(predictions, bins=50)
    plt.title("Distribution of Predicted Probabilities")
    plt.show()

    # Plot training and validation accuracy
    plt.plot(history.history['accuracy'], label='Training Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Training and Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.show()

    # Save the model to a file. os.path.join picks the platform separator
    # instead of the hard-coded Windows backslash.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = os.path.join(network_path, f"{pattern}_Model_{timestamp}.h5")
    print(f"Network path: {filename}")
    print(network_path)
    model.save(filename)


if __name__ == "__main__":
    main()
备注:内容来源于Stack Exchange,提问作者rcolombari




