You need to enable JavaScript to run this app.
最新活动
大模型
产品
解决方案
定价
生态与合作
支持与服务
开发者
了解我们

TensorFlow全连接神经网络准确率约0.75但Loss无法低于0.5的优化及特征相关问题咨询

TensorFlow全连接神经网络准确率约0.75但Loss无法低于0.5的优化及特征相关问题咨询

我用Python实现了一个全连接神经网络(FCNN),数据集包含约5000场足球比赛,使用的特征是各项比赛统计数据:射门、角球、黄牌、犯规、创造的绝佳机会、传球、传中和扑救的次数。

预测目标是二分类:我把比赛结果(主胜/客胜/平局)处理成了二分类标签。模型输出层用了1个神经元+sigmoid激活函数,损失函数选的是binary_crossentropy,还加了Dropout和早停机制。

现在训练集和验证集的准确率都在0.75左右,看起来还不错,但两者的Loss始终降不到0.5以下,这应该不太理想吧?有没有什么优化方法能让Loss降下来?

编辑补充:我后来加了两个新特征——主队和客队的射门转化率(进球数/射门数),现在Loss直接降到了0.1甚至更低!
不过有个问题:用随机森林做特征重要性分析时,这两个新特征的权重远远高于其他所有特征,这让我有点担心是不是哪里不对。

(图:随机森林特征重要性柱状图)

以下是我完整的实现代码:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import sys
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.callbacks import LearningRateScheduler
from sklearn.preprocessing import MinMaxScaler
from datetime import datetime
from tensorflow.keras.utils import to_categorical

# Expect exactly three command-line arguments after the script name:
#   1. file_name    - path of the CSV dataset to load,
#   2. pattern      - code selecting how the binary label is built ("HW", "HWD", ...),
#   3. network_path - directory in which the trained model is saved.
if len(sys.argv) == 4:
    file_name = sys.argv[1]
    pattern = sys.argv[2]
    network_path = sys.argv[3]
else:
    print("[ERROR] Incorrect parameters")
    # A bare `quit` only names the builtin without calling it, so the script
    # used to keep running and crash later with a NameError on `file_name`.
    # Terminate explicitly with a non-zero status instead.
    sys.exit(1)

# Step 1: Load the dataset
df = pd.read_csv(file_name)

print("Available columns:", df.columns)

# Remove corrupted lines.
# pandas.read_csv parses empty fields as NaN by default, so comparing against
# "" alone never drops them; reject both missing and empty match identifiers.
df = df[df['MatchId'].notna() & (df['MatchId'] != "")]

# Aggregate shots in just 2 columns, for home and away
df['HomeTeamShots'] = df['HomeTeamShotsOnTarget'] + df['HomeTeamShotsOffTarget'] + df['HomeTeamBlockedShots']
df['AwayTeamShots'] = df['AwayTeamShotsOnTarget'] + df['AwayTeamShotsOffTarget'] + df['AwayTeamBlockedShots']

# Shot conversion rate [newly added]
# NOTE(review): these two features are computed from HomeTeamFTGoal /
# AwayTeamFTGoal, the same columns the 'Outcome' label is derived from, so
# enabling them leaks the target into the inputs (goals = rate * shots).
# That would explain a sudden loss drop and a dominant feature importance.
# df['HomeTeamConversionRate'] = 0  # Default value for conversion rate
# df.loc[df['HomeTeamShots'] > 0, 'HomeTeamConversionRate'] = df['HomeTeamFTGoal'] / df['HomeTeamShots']

# df['AwayTeamConversionRate'] = 0  # Default value for conversion rate
# df.loc[df['AwayTeamShots'] > 0, 'AwayTeamConversionRate'] = df['AwayTeamFTGoal'] / df['AwayTeamShots']

# Build the binary target column 'Outcome' for the requested pattern.
# Label convention (unchanged): 0 = the event named by the pattern happened,
# 1 = it did not.
home_goals = df['HomeTeamFTGoal']
away_goals = df['AwayTeamFTGoal']
total_goals = home_goals + away_goals

# Pattern code -> boolean Series that is True where the event happened.
# Comparisons involving NaN evaluate to False, so such rows get label 1,
# exactly as the previous row-by-row lambdas did.
outcome_events = {
    "HW": home_goals > away_goals,                # Home wins
    "HWD": home_goals >= away_goals,              # Home wins or draws
    "AW": home_goals < away_goals,                # Away wins
    "OV25": total_goals > 2,                      # Over 2.5 goals
    "BTTS": (home_goals > 0) & (away_goals > 0),  # Both teams to score
    "OV35": total_goals > 3,                      # Over 3.5 goals
    "DRAW": home_goals == away_goals,             # Draw
    "HOV05": home_goals > 0,                      # Home over 0.5
    "AOV05": away_goals > 0,                      # Away over 0.5
    "HOV15": home_goals > 1,                      # Home over 1.5
    "AOV15": away_goals > 1,                      # Away over 1.5
}

# An unrecognised pattern used to leave 'Outcome' undefined and only failed
# later when the column was accessed; stop here with a clear message instead.
if pattern not in outcome_events:
    sys.exit(
        f"[ERROR] Unknown pattern '{pattern}'. "
        f"Expected one of: {', '.join(outcome_events)}"
    )

# Vectorised equivalent of the former df.apply(..., axis=1) per-row lambdas.
df['Outcome'] = np.where(outcome_events[pattern], 0, 1)

# Columns excluded from the model inputs: identifiers, team names, match date,
# full-time goals, xG, and the individual shot breakdown columns.
columns_to_discard = [
    'HomeTeamID', 'AwayTeamID', 'MatchId',
    'HomeTeamFTGoal', 'AwayTeamFTGoal',
    'HomeTeamName', 'AwayTeamName', 'MatchDate',
    'HomeTeamxG', 'AwayTeamxG',
    'HomeTeamBlockedShots', 'AwayTeamBlockedShots',
    'HomeTeamShotsInsideBox', 'AwayTeamShotsInsideBox',
    'HomeTeamShotsOutsideBox', 'AwayTeamShotsOutsideBox',
    'HomeTeamShotsOnTarget', 'AwayTeamShotsOnTarget',
    'HomeTeamShotsOffTarget', 'AwayTeamShotsOffTarget',
]
df = df.drop(columns=columns_to_discard)

# Everything except 'Outcome' is a feature; 'Outcome' is the target.
y = df['Outcome'].values
X = df.drop(columns=['Outcome']).values

# The target stays a single 0/1 column (no one-hot encoding); the name
# y_one_hot is kept only because the split below refers to it.
y_one_hot = y

# Hold out 20% of the rows with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_one_hot,
    test_size=0.2,
    random_state=42,
)

# Fit the min-max scaler on the training rows only, then apply it to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

############### Random forest for feature evaluation ###############

# Keep the feature column labels so importances can be reported by name.
feature_names = df.drop(columns=['Outcome']).columns
X = pd.DataFrame(X)

# Labelled view of the scaled training data. It is built here, but the forest
# below is fitted on the plain array X_train_scaled.
X_scaled_df = pd.DataFrame(X_train_scaled, columns=feature_names)

# Fit a 100-tree forest with a fixed seed on the scaled training split.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model = rf_model.fit(X_train_scaled, y_train)

# One importance value per feature, in the same order as feature_names.
importances_rf = rf_model.feature_importances_

# Report the importances as text...
print("Feature Importances from Random Forest:")
for position, feature in enumerate(feature_names):
    print(f"{feature}: {importances_rf[position]}")

# ...and as a horizontal bar chart.
plt.figure(figsize=(10, 6))
plt.barh(feature_names, importances_rf)
plt.title('Feature Importance from Random Forest')
plt.xlabel('Feature Importance')
plt.show()

############### Random forest for feature evaluation ###############

############### Models ###############

# Fully connected binary classifier: 64 -> 32 -> 32 -> 1.
# The first two hidden layers use L2 weight regularisation (0.01) and are each
# followed by batch normalisation and 20% dropout.
n_features = X_train.shape[1]
network_layers = [
    Dense(64, activation='relu', input_shape=(n_features,), kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu', kernel_regularizer=l2(0.01)),
    BatchNormalization(),
    Dropout(0.2),
    Dense(32, activation='relu'),
    # Single sigmoid unit: outputs the probability of label 1.
    Dense(1, activation='sigmoid'),
]
model = Sequential(network_layers)

model.compile(
    optimizer=Adam(learning_rate=0.0005),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Both callbacks watch validation accuracy: stop after 20 epochs without
# improvement (restoring the best weights), and halve the learning rate after
# 3 stagnant epochs, never going below 1e-6.
early_stopping = EarlyStopping(
    monitor='val_accuracy',
    patience=20,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)

# NOTE(review): the held-out test split doubles as validation data here, so
# early stopping and the learning-rate schedule are tuned on it.
history = model.fit(
    X_train_scaled,
    y_train,
    validation_data=(X_test_scaled, y_test),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

# Training vs. validation loss per epoch
plt.figure(figsize=(10, 6))
for metric_key, curve_label, curve_color in (
    ('loss', 'Training Loss', 'blue'),
    ('val_loss', 'Validation Loss', 'orange'),
):
    plt.plot(history.history[metric_key], label=curve_label, color=curve_color)
plt.title('Training and Validation Loss Over Epochs', fontsize=16)
plt.xlabel('Epochs', fontsize=14)
plt.ylabel('Loss', fontsize=14)
plt.legend(fontsize=12)
plt.grid(True)
plt.show()

# Histogram of the probabilities predicted for the held-out split
predictions = model.predict(X_test_scaled)
plt.hist(predictions, bins=50)
plt.title("Distribution of Predicted Probabilities")
plt.show()

# Training vs. validation accuracy per epoch
for metric_key, curve_label in (
    ('accuracy', 'Training Accuracy'),
    ('val_accuracy', 'Validation Accuracy'),
):
    plt.plot(history.history[metric_key], label=curve_label)
plt.title('Training and Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

# Save the model to a timestamped file named <pattern>_Model_<YYYYmmdd_HHMMSS>.h5
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# Join with the platform's own separator: a hard-coded "\\" only works on
# Windows and elsewhere yields a single file name containing a backslash.
filename = os.path.join(network_path, f"{pattern}_Model_{timestamp}.h5")
print(f"Network path: {filename}")
print(network_path)
model.save(filename)

备注:内容来源于Stack Exchange,提问作者rcolombari

火山引擎 最新活动