What is fantasy basketball and what is the aim of our project?
Fantasy basketball is a game where participants create virtual teams composed of real NBA players. These teams compete based on the statistical performance of the players in actual games. Each player accumulates fantasy points through various actions such as scoring, assists, rebounds, steals, and blocks. The objective is to select a lineup of players who will collectively score the most points within a week. Predicting the best possible fantasy basketball lineup involves analyzing a vast amount of data from past and current NBA games. The challenge lies in processing and interpreting information, such as player statistics, to make accurate predictions about future player performances.
Why is this important?
For fantasy basketball enthusiasts like ourselves, having a data-driven approach to selecting players can significantly enhance their chances of winning. It transforms the game from a casual hobby into a strategic and analytical competition. Fantasy basketball is a rapidly growing industry with millions of participants worldwide. Successful fantasy players can win substantial prizes like money, and platforms hosting these games benefit from higher user engagement and satisfaction. Therefore, developing accurate prediction models can have significant benefits for people and businesses involved in fantasy sports.
Beyond fantasy sports, the methodologies and insights gained from this project can contribute to the broader field of sports analytics. Teams, coaches, and analysts can use similar techniques to improve real-world strategies and performance evaluations. This project aims to leverage advanced analytical techniques to create a model that can provide fantasy basketball players with a competitive edge, contributing to both personal and professional growth in the field of data science.
Over this tutorial, we will be going through the data science lifecycle as follows: data collection, data processing and cleaning, exploratory data analysis and visualization, hypothesis testing and machine learning, and finally insights and conclusions.
To develop this project, we will be using the Python language and Jupyter Lab, an interactive development environment. If you are new to Jupyter Lab, you can learn more about it here. In this section, we will collect all relevant NBA player data from a trusted source using Python. We will start by importing several relevant Python libraries to use throughout this tutorial.
import time
import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
pd.set_option('display.max_columns', None)
One of the main libraries we will use in this project is Pandas, an open-source library built on top of Python that makes data manipulation easy and flexible. Another library that boosts efficiency is NumPy, which provides fast, array-based numerical computation for large datasets.
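As a quick illustration of what these libraries give us (the numbers below are made up), pandas stores tabular data while NumPy performs fast arithmetic over whole columns at once:

# Toy example: a two-row table of made-up player totals, with a vectorized per-game calculation
toy = pd.DataFrame({'Player': ['Player A', 'Player B'], 'PTS': [1500, 900], 'G': [75, 60]})
toy['PPG'] = np.round(toy['PTS'] / toy['G'], 1)  # points per game for every row at once
print(toy)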
Now, we will look for a dataset we can use for our topic. Since our topic is based on ESPN’s default fantasy basketball scoring system, we gathered NBA player data from Basketball Reference, a widely respected website that tracks NBA statistics. We verified the reliability of the source: it is a popular and widely trusted reference for accurate basketball statistics, detailed player profiles, and historical data, and it is often used by sports analysts, journalists, and fans for in-depth analysis.
Specifically, we gathered HTML files containing player performance data from the 2012-13 to the 2023-24 NBA season using HTTP GET requests to the relevant HTML files from Basketball Reference. Then, we parse the saved HTML files for each season to extract the player statistics table, convert each table into a pandas dataframe, and store all these dataframes in a list. Each dataframe in the list corresponds to the player statistics for a specific NBA season.
import os
os.makedirs("totals", exist_ok=True)  # make sure the output folder exists

# Seasons 2012-13 through 2023-24 (Basketball Reference labels a season by its end year)
years = list(range(2013, 2025))
player_totals_url = "https://www.basketball-reference.com/leagues/NBA_{}_totals.html"

# Download and save the totals page for each season
for yr in years:
    url = player_totals_url.format(yr)
    data = requests.get(url)
    with open("totals/{}.html".format(yr), "w+") as f:
        f.write(data.text)
    time.sleep(3)  # pause between requests to respect Basketball Reference's rate limits

# Parse each saved page into a dataframe of player totals
dfs = []
for yr in years:
    with open("totals/{}.html".format(yr)) as f:
        page = f.read()
    soup = BeautifulSoup(page, "html.parser")
    soup.find('tr', class_="full_table").decompose()
    stats_table = soup.find(id="totals_stats")
    players = pd.read_html(str(stats_table))[0]
    dfs.append(players)
Data Explanation
To explain fantasy basketball’s default points-league scoring system, we referred to the article mentioned previously from ESPN’s website. Specifically, the website contains a detailed breakdown of the default point scoring system. For a more detailed breakdown of what each column variable in our dataframes mean, we referred to the glossary available above Basketball Reference’s player statistics tables.
ESPN default scoring system for fantasy basketball:
- Point scored (PTS): +1
- Three-pointer made (3P): +1
- Field goal made (FG): +2
- Field goal attempted (FGA): -1
- Free throw made (FT): +1
- Free throw attempted (FTA): -1
- Rebound (TRB): +1
- Assist (AST): +2
- Steal (STL): +4
- Block (BLK): +4
- Turnover (TOV): -2
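To make these point values concrete, here is a small worked sketch (the stat line is made up) showing how a single stat line translates into fantasy points under the default weights:

# Fantasy points for one hypothetical stat line under ESPN's default scoring
stat_line = {'PTS': 20, '3P': 2, 'FG': 8, 'FGA': 15, 'FT': 5, 'FTA': 6,
             'TRB': 7, 'AST': 4, 'STL': 1, 'BLK': 1, 'TOV': 3}
weights = {'PTS': 1, '3P': 1, 'FG': 2, 'FGA': -1, 'FT': 1, 'FTA': -1,
           'TRB': 1, 'AST': 2, 'STL': 4, 'BLK': 4, 'TOV': -2}
fantasy_points = sum(stat_line[k] * weights[k] for k in weights)
print(fantasy_points)  # 39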
Glossary of basketball stats terms: Rk = rank, Pos = position, Tm = team, G = games played, GS = games started, MP = minutes played, FG/FGA/FG% = field goals made/attempted/percentage, 3P/3PA/3P% = three-pointers made/attempted/percentage, 2P/2PA/2P% = two-pointers made/attempted/percentage, eFG% = effective field goal percentage, FT/FTA/FT% = free throws made/attempted/percentage, ORB/DRB/TRB = offensive/defensive/total rebounds, AST = assists, STL = steals, BLK = blocks, TOV = turnovers, PF = personal fouls, PTS = points.
dfs[0].head()
| | Rk | Player | Pos | Age | Tm | G | GS | MP | FG | FGA | FG% | 3P | 3PA | 3P% | 2P | 2PA | 2P% | eFG% | FT | FTA | FT% | ORB | DRB | TRB | AST | STL | BLK | TOV | PF | PTS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | Jeff Adrien | PF | 26 | CHA | 52 | 5 | 713 | 72 | 168 | .429 | 0 | 2 | .000 | 72 | 166 | .434 | .429 | 65 | 100 | .650 | 68 | 128 | 196 | 36 | 18 | 27 | 32 | 80 | 209 |
| 1 | 3 | Arron Afflalo | SF | 27 | ORL | 64 | 64 | 2307 | 397 | 905 | .439 | 72 | 240 | .300 | 325 | 665 | .489 | .478 | 191 | 223 | .857 | 29 | 210 | 239 | 206 | 40 | 11 | 138 | 137 | 1057 |
| 2 | 4 | Josh Akognon | PG | 26 | DAL | 3 | 0 | 9 | 2 | 4 | .500 | 1 | 2 | .500 | 1 | 2 | .500 | .625 | 0 | 0 | NaN | 0 | 1 | 1 | 1 | 0 | 0 | 0 | 3 | 5 |
| 3 | 5 | Cole Aldrich | C | 24 | TOT | 45 | 0 | 388 | 44 | 80 | .550 | 0 | 0 | NaN | 44 | 80 | .550 | .550 | 12 | 20 | .600 | 30 | 90 | 120 | 9 | 5 | 23 | 23 | 60 | 100 |
| 4 | 5 | Cole Aldrich | C | 24 | HOU | 30 | 0 | 213 | 23 | 43 | .535 | 0 | 0 | NaN | 23 | 43 | .535 | .535 | 4 | 9 | .444 | 12 | 45 | 57 | 6 | 3 | 9 | 14 | 41 | 50 |
Now that we have collected all of our data, we need to clean and format it properly.
Formatting
First, we process each season's dataframe by removing columns that are not relevant to the fantasy scoring calculation. The columns we drop are the shooting percentages (FG%, 3P%, 2P%, FT%), effective field goal percentage (eFG%), offensive and defensive rebounds (ORB, DRB), personal fouls (PF), and rank (Rk).
columns_to_drop = ['FG%', '3P%', '2P%', 'eFG%', 'ORB', 'DRB', 'PF', 'FT%', 'Rk']
for df in dfs:
    columns_to_drop_existing = [col for col in columns_to_drop if col in df.columns]
    df.drop(columns=columns_to_drop_existing, inplace=True)
dfs[1]
| | Player | Pos | Age | Tm | G | GS | MP | FG | FGA | 3P | 3PA | 2P | 2PA | FT | FTA | TRB | AST | STL | BLK | TOV | PTS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Quincy Acy | SF | 23 | TOR | 7 | 0 | 61 | 6 | 14 | 2 | 5 | 4 | 9 | 5 | 8 | 15 | 4 | 4 | 3 | 2 | 19 |
| 1 | Quincy Acy | SF | 23 | SAC | 56 | 0 | 786 | 60 | 127 | 2 | 10 | 58 | 117 | 30 | 45 | 201 | 24 | 19 | 23 | 28 | 152 |
| 2 | Steven Adams | C | 20 | OKC | 81 | 20 | 1197 | 93 | 185 | 0 | 0 | 93 | 185 | 79 | 136 | 332 | 43 | 40 | 57 | 71 | 265 |
| 3 | Jeff Adrien | PF | 27 | TOT | 53 | 12 | 961 | 143 | 275 | 0 | 0 | 143 | 275 | 76 | 119 | 306 | 38 | 24 | 36 | 39 | 362 |
| 4 | Jeff Adrien | PF | 27 | CHA | 25 | 0 | 256 | 22 | 40 | 0 | 0 | 22 | 40 | 13 | 25 | 88 | 7 | 7 | 15 | 8 | 57 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 629 | Nick Young | SG | 28 | LAL | 64 | 9 | 1810 | 387 | 889 | 135 | 350 | 252 | 539 | 235 | 285 | 166 | 95 | 46 | 12 | 95 | 1144 |
| 630 | Thaddeus Young | PF | 25 | PHI | 79 | 78 | 2718 | 582 | 1283 | 90 | 292 | 492 | 991 | 163 | 229 | 476 | 182 | 167 | 36 | 165 | 1417 |
| 631 | Player | Pos | Age | Tm | G | GS | MP | FG | FGA | 3P | 3PA | 2P | 2PA | FT | FTA | TRB | AST | STL | BLK | TOV | PTS |
| 632 | Cody Zeller | C | 21 | CHA | 82 | 3 | 1416 | 172 | 404 | 0 | 1 | 172 | 403 | 146 | 200 | 353 | 92 | 40 | 41 | 87 | 490 |
| 633 | Tyler Zeller | C | 24 | CLE | 70 | 9 | 1049 | 156 | 290 | 0 | 1 | 156 | 289 | 87 | 121 | 282 | 36 | 18 | 38 | 60 | 399 |
634 rows × 21 columns
Next, we drop the rows in each dataframe where the 'Player' column literally contains the string 'Player'; these are repeated header rows carried over from the HTML tables.
for i in range(len(dfs)):
    dfs[i] = dfs[i].drop(dfs[i][dfs[i]['Player'] == 'Player'].index)
dfs[1]
| | Player | Pos | Age | Tm | G | GS | MP | FG | FGA | 3P | 3PA | 2P | 2PA | FT | FTA | TRB | AST | STL | BLK | TOV | PTS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Quincy Acy | SF | 23 | TOR | 7 | 0 | 61 | 6 | 14 | 2 | 5 | 4 | 9 | 5 | 8 | 15 | 4 | 4 | 3 | 2 | 19 |
| 1 | Quincy Acy | SF | 23 | SAC | 56 | 0 | 786 | 60 | 127 | 2 | 10 | 58 | 117 | 30 | 45 | 201 | 24 | 19 | 23 | 28 | 152 |
| 2 | Steven Adams | C | 20 | OKC | 81 | 20 | 1197 | 93 | 185 | 0 | 0 | 93 | 185 | 79 | 136 | 332 | 43 | 40 | 57 | 71 | 265 |
| 3 | Jeff Adrien | PF | 27 | TOT | 53 | 12 | 961 | 143 | 275 | 0 | 0 | 143 | 275 | 76 | 119 | 306 | 38 | 24 | 36 | 39 | 362 |
| 4 | Jeff Adrien | PF | 27 | CHA | 25 | 0 | 256 | 22 | 40 | 0 | 0 | 22 | 40 | 13 | 25 | 88 | 7 | 7 | 15 | 8 | 57 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 628 | Tony Wroten | PG | 20 | PHI | 72 | 16 | 1765 | 345 | 808 | 40 | 188 | 305 | 620 | 209 | 326 | 228 | 217 | 78 | 16 | 204 | 939 |
| 629 | Nick Young | SG | 28 | LAL | 64 | 9 | 1810 | 387 | 889 | 135 | 350 | 252 | 539 | 235 | 285 | 166 | 95 | 46 | 12 | 95 | 1144 |
| 630 | Thaddeus Young | PF | 25 | PHI | 79 | 78 | 2718 | 582 | 1283 | 90 | 292 | 492 | 991 | 163 | 229 | 476 | 182 | 167 | 36 | 165 | 1417 |
| 632 | Cody Zeller | C | 21 | CHA | 82 | 3 | 1416 | 172 | 404 | 0 | 1 | 172 | 403 | 146 | 200 | 353 | 92 | 40 | 41 | 87 | 490 |
| 633 | Tyler Zeller | C | 24 | CLE | 70 | 9 | 1049 | 156 | 290 | 0 | 1 | 156 | 289 | 87 | 121 | 282 | 36 | 18 | 38 | 60 | 399 |
610 rows × 21 columns
It is important to note that, because the repeated header rows mixed text into every column, pandas parsed these columns as strings (the object dtype) rather than numbers. Thus, we convert every column that should contain integer data in each dataframe from strings to integers. The selected columns include counting statistics such as points, rebounds, assists, and other performance metrics. This conversion lets us perform numerical computations efficiently in the analysis that follows.
columns_to_convert = ['Age', 'G', 'GS', 'MP', 'FG', 'FGA', '3P', '3PA', '2P', '2PA', 'FT', 'FTA', 'TRB', 'AST', 'STL', 'BLK', 'TOV', 'PTS']
for i in range(len(dfs)):
    dfs[i][columns_to_convert] = dfs[i][columns_to_convert].astype(int)
dfs[1]
| | Player | Pos | Age | Tm | G | GS | MP | FG | FGA | 3P | 3PA | 2P | 2PA | FT | FTA | TRB | AST | STL | BLK | TOV | PTS |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Quincy Acy | SF | 23 | TOR | 7 | 0 | 61 | 6 | 14 | 2 | 5 | 4 | 9 | 5 | 8 | 15 | 4 | 4 | 3 | 2 | 19 |
| 1 | Quincy Acy | SF | 23 | SAC | 56 | 0 | 786 | 60 | 127 | 2 | 10 | 58 | 117 | 30 | 45 | 201 | 24 | 19 | 23 | 28 | 152 |
| 2 | Steven Adams | C | 20 | OKC | 81 | 20 | 1197 | 93 | 185 | 0 | 0 | 93 | 185 | 79 | 136 | 332 | 43 | 40 | 57 | 71 | 265 |
| 3 | Jeff Adrien | PF | 27 | TOT | 53 | 12 | 961 | 143 | 275 | 0 | 0 | 143 | 275 | 76 | 119 | 306 | 38 | 24 | 36 | 39 | 362 |
| 4 | Jeff Adrien | PF | 27 | CHA | 25 | 0 | 256 | 22 | 40 | 0 | 0 | 22 | 40 | 13 | 25 | 88 | 7 | 7 | 15 | 8 | 57 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 628 | Tony Wroten | PG | 20 | PHI | 72 | 16 | 1765 | 345 | 808 | 40 | 188 | 305 | 620 | 209 | 326 | 228 | 217 | 78 | 16 | 204 | 939 |
| 629 | Nick Young | SG | 28 | LAL | 64 | 9 | 1810 | 387 | 889 | 135 | 350 | 252 | 539 | 235 | 285 | 166 | 95 | 46 | 12 | 95 | 1144 |
| 630 | Thaddeus Young | PF | 25 | PHI | 79 | 78 | 2718 | 582 | 1283 | 90 | 292 | 492 | 991 | 163 | 229 | 476 | 182 | 167 | 36 | 165 | 1417 |
| 632 | Cody Zeller | C | 21 | CHA | 82 | 3 | 1416 | 172 | 404 | 0 | 1 | 172 | 403 | 146 | 200 | 353 | 92 | 40 | 41 | 87 | 490 |
| 633 | Tyler Zeller | C | 24 | CLE | 70 | 9 | 1049 | 156 | 290 | 0 | 1 | 156 | 289 | 87 | 121 | 282 | 36 | 18 | 38 | 60 | 399 |
610 rows × 21 columns
Some players play for multiple teams in one season, and their statistics appear in a separate row for each team (Basketball Reference also includes a combined 'TOT' row, which we drop before summing the per-team rows to avoid double counting). We therefore aggregate the statistics of players who played for multiple teams in a single season, consolidating their data into a single row. This ensures that each player is represented by exactly one row per season.
def concat_teams(teams):
    return ','.join(sorted(set(teams)))

agg_funcs = {
    'Age': 'first',
    'Pos': 'first',
    'G': 'sum',
    'GS': 'sum',
    'MP': 'sum',
    'FG': 'sum',
    'FGA': 'sum',
    '3P': 'sum',
    '3PA': 'sum',
    '2P': 'sum',
    '2PA': 'sum',
    'FT': 'sum',
    'FTA': 'sum',
    'TRB': 'sum',
    'AST': 'sum',
    'STL': 'sum',
    'BLK': 'sum',
    'TOV': 'sum',
    'PTS': 'sum',
    'Tm': concat_teams
}

# List to hold the aggregated dataframes
df_list = []
for df in dfs:
    # Filter out rows where the team is 'TOT'
    df_filtered = df[df['Tm'] != 'TOT']
    # Group by player and apply the aggregation functions
    grouped_df = df_filtered.groupby('Player').agg(agg_funcs).reset_index()
    df_list.append(grouped_df)
Dealing with Missing Data
For each dataframe, we add back the percentage columns we previously dropped. If a player has zero attempts for the relevant shot type (which would otherwise force a division by zero), we set the percentage to 0 instead of the default NaN (missing) value. Note that percentages are not part of the fantasy point calculation, but they will serve as useful data points for insights later on.
def add_percentage_columns(df):
    df['FG%'] = df.apply(lambda row: row['FG'] / row['FGA'] if row['FGA'] > 0 else 0, axis=1)
    df['3P%'] = df.apply(lambda row: row['3P'] / row['3PA'] if row['3PA'] > 0 else 0, axis=1)
    df['2P%'] = df.apply(lambda row: row['2P'] / row['2PA'] if row['2PA'] > 0 else 0, axis=1)
    df['eFG%'] = df.apply(lambda row: (row['FG'] + 0.5 * row['3P']) / row['FGA'] if row['FGA'] > 0 else 0, axis=1)
    df['FT%'] = df.apply(lambda row: row['FT'] / row['FTA'] if row['FTA'] > 0 else 0, axis=1)
    return df
df_list = [add_percentage_columns(df) for df in df_list]
Filtering
Next, we keep only the 400 players with the most minutes played in each season. Players with very few minutes can skew the data and create outliers, so it is important to focus on players who played a consistent amount of time across the season.
def filter_top_400_by_minutes(df):
    return df.sort_values(by='MP', ascending=False).head(400)
filtered_df_list = [filter_top_400_by_minutes(df) for df in df_list]
filtered_df_list[1].head()
| | Player | Age | Pos | G | GS | MP | FG | FGA | 3P | 3PA | 2P | 2PA | FT | FTA | TRB | AST | STL | BLK | TOV | PTS | Tm | FG% | 3P% | 2P% | eFG% | FT% |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 265 | Kevin Durant | 25 | SF | 81 | 81 | 3122 | 849 | 1688 | 192 | 491 | 657 | 1197 | 703 | 805 | 598 | 445 | 103 | 59 | 285 | 2593 | OKC | 0.502962 | 0.391039 | 0.548872 | 0.559834 | 0.873292 |
| 335 | Monta Ellis | 28 | SG | 82 | 82 | 3023 | 576 | 1278 | 69 | 209 | 507 | 1069 | 339 | 430 | 295 | 471 | 141 | 23 | 264 | 1560 | DAL | 0.450704 | 0.330144 | 0.474275 | 0.477700 | 0.788372 |
| 111 | DeMar DeRozan | 24 | SG | 79 | 79 | 3017 | 604 | 1407 | 64 | 210 | 540 | 1197 | 519 | 630 | 343 | 313 | 86 | 28 | 176 | 1791 | TOR | 0.429282 | 0.304762 | 0.451128 | 0.452026 | 0.823810 |
| 65 | Carmelo Anthony | 29 | PF | 77 | 77 | 2982 | 743 | 1643 | 167 | 415 | 576 | 1228 | 459 | 541 | 622 | 242 | 95 | 51 | 198 | 2112 | NYK | 0.452222 | 0.402410 | 0.469055 | 0.503043 | 0.848429 |
| 237 | John Wall | 23 | PG | 82 | 82 | 2980 | 579 | 1337 | 108 | 308 | 471 | 1029 | 317 | 394 | 333 | 721 | 149 | 40 | 295 | 1583 | WAS | 0.433059 | 0.350649 | 0.457726 | 0.473448 | 0.804569 |
filtered_df_list[2].head()
| | Player | Age | Pos | G | GS | MP | FG | FGA | 3P | 3PA | 2P | 2PA | FT | FTA | TRB | AST | STL | BLK | TOV | PTS | Tm | FG% | 3P% | 2P% | eFG% | FT% |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 200 | James Harden | 25 | SG | 81 | 81 | 2981 | 647 | 1470 | 208 | 555 | 439 | 915 | 715 | 824 | 459 | 565 | 154 | 60 | 321 | 2217 | HOU | 0.440136 | 0.374775 | 0.479781 | 0.510884 | 0.867718 |
| 27 | Andrew Wiggins | 19 | SF | 82 | 82 | 2969 | 497 | 1137 | 39 | 126 | 458 | 1011 | 354 | 466 | 374 | 170 | 86 | 50 | 177 | 1387 | MIN | 0.437115 | 0.309524 | 0.453017 | 0.454266 | 0.759657 |
| 459 | Trevor Ariza | 29 | SF | 82 | 82 | 2930 | 366 | 910 | 194 | 555 | 172 | 355 | 122 | 143 | 459 | 209 | 152 | 17 | 141 | 1048 | HOU | 0.402198 | 0.349550 | 0.484507 | 0.508791 | 0.853147 |
| 91 | Damian Lillard | 24 | PG | 82 | 82 | 2925 | 590 | 1360 | 196 | 572 | 394 | 788 | 344 | 398 | 378 | 507 | 97 | 21 | 222 | 1720 | POR | 0.433824 | 0.342657 | 0.500000 | 0.505882 | 0.864322 |
| 79 | Chris Paul | 29 | PG | 82 | 82 | 2857 | 568 | 1170 | 139 | 349 | 429 | 821 | 289 | 321 | 376 | 838 | 156 | 15 | 190 | 1564 | LAC | 0.485470 | 0.398281 | 0.522533 | 0.544872 | 0.900312 |
In this phase of the data science life cycle, we will graph the data to enhance our understanding and perform statistical analyses to obtain mathematical evidence for any identified trends.
Using the ESPN default scoring key mentioned in the previous section, we compute the fantasy points contributed by each category and total them in a new column called 'FP_Total'. For clarity, we also store each category's contribution in its own column so we can visualize the components before summing the final fantasy score for each player.
def add_fantasy_columns(df):
    df['FP_PTS'] = df.apply(lambda row: row['PTS'], axis=1)
    df['FP_3P'] = df.apply(lambda row: row['3P'], axis=1)
    df['FP_FGA'] = df.apply(lambda row: row['FGA'] * -1, axis=1)
    df['FP_FG'] = df.apply(lambda row: row['FG'] * 2, axis=1)
    df['FP_FTA'] = df.apply(lambda row: row['FTA'] * -1, axis=1)
    df['FP_FT'] = df.apply(lambda row: row['FT'], axis=1)
    df['FP_TRB'] = df.apply(lambda row: row['TRB'], axis=1)
    df['FP_AST'] = df.apply(lambda row: row['AST'] * 2, axis=1)
    df['FP_STL'] = df.apply(lambda row: row['STL'] * 4, axis=1)
    df['FP_BLK'] = df.apply(lambda row: row['BLK'] * 4, axis=1)
    df['FP_TOV'] = df.apply(lambda row: row['TOV'] * -2, axis=1)
    return df
filtered_df_list = [add_fantasy_columns(df) for df in filtered_df_list]
filtered_df_list[4].head()
| | Player | Age | Pos | G | GS | MP | FG | FGA | 3P | 3PA | 2P | 2PA | FT | FTA | TRB | AST | STL | BLK | TOV | PTS | Tm | FG% | 3P% | 2P% | eFG% | FT% | FP_PTS | FP_3P | FP_FGA | FP_FG | FP_FTA | FP_FT | FP_TRB | FP_AST | FP_STL | FP_BLK | FP_TOV |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 24 | Andrew Wiggins | 21 | SF | 82 | 82 | 3048 | 709 | 1570 | 103 | 289 | 606 | 1281 | 412 | 542 | 328 | 189 | 82 | 30 | 187 | 1933 | MIN | 0.451592 | 0.356401 | 0.473068 | 0.484395 | 0.760148 | 1933 | 103 | -1570 | 1418 | -542 | 412 | 328 | 378 | 328 | 120 | -374 |
| 257 | Karl-Anthony Towns | 21 | C | 82 | 82 | 3030 | 802 | 1480 | 101 | 275 | 701 | 1205 | 356 | 428 | 1007 | 220 | 56 | 103 | 212 | 2061 | MIN | 0.541892 | 0.367273 | 0.581743 | 0.576014 | 0.831776 | 2061 | 101 | -1480 | 1604 | -428 | 356 | 1007 | 440 | 224 | 412 | -424 |
| 193 | James Harden | 27 | PG | 81 | 81 | 2947 | 674 | 1533 | 262 | 756 | 412 | 777 | 746 | 881 | 659 | 907 | 121 | 38 | 464 | 2356 | HOU | 0.439661 | 0.346561 | 0.530245 | 0.525114 | 0.846765 | 2356 | 262 | -1533 | 1348 | -881 | 746 | 659 | 1814 | 484 | 152 | -928 |
| 159 | Giannis Antetokounmpo | 22 | SF | 80 | 80 | 2845 | 656 | 1259 | 49 | 180 | 607 | 1079 | 471 | 612 | 700 | 434 | 131 | 151 | 234 | 1832 | MIL | 0.521048 | 0.272222 | 0.562558 | 0.540508 | 0.769608 | 1832 | 49 | -1259 | 1312 | -612 | 471 | 700 | 868 | 524 | 604 | -468 |
| 230 | John Wall | 26 | PG | 78 | 78 | 2836 | 647 | 1435 | 89 | 272 | 558 | 1163 | 422 | 527 | 326 | 831 | 157 | 49 | 322 | 1805 | WAS | 0.450871 | 0.327206 | 0.479794 | 0.481882 | 0.800759 | 1805 | 89 | -1435 | 1294 | -527 | 422 | 326 | 1662 | 628 | 196 | -644 |
def get_total_fp(df):
    df['FP_Total'] = df.apply(lambda row: row['FP_PTS'] + row['FP_3P'] + row['FP_FGA'] + row['FP_FG'] + row['FP_FTA'] + row['FP_FT'] + row['FP_TRB'] + row['FP_AST'] + row['FP_STL'] + row['FP_BLK'] + row['FP_TOV'], axis=1)
    return df
filtered_df_list = [get_total_fp(df) for df in filtered_df_list]
filtered_df_list[5].head()
| | Player | Age | Pos | G | GS | MP | FG | FGA | 3P | 3PA | 2P | 2PA | FT | FTA | TRB | AST | STL | BLK | TOV | PTS | Tm | FG% | 3P% | 2P% | eFG% | FT% | FP_PTS | FP_3P | FP_FGA | FP_FG | FP_FTA | FP_FT | FP_TRB | FP_AST | FP_STL | FP_BLK | FP_TOV | FP_Total |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 324 | LeBron James | 33 | PF | 82 | 82 | 3026 | 857 | 1580 | 149 | 406 | 708 | 1174 | 388 | 531 | 709 | 747 | 116 | 71 | 347 | 2251 | CLE | 0.542405 | 0.366995 | 0.603066 | 0.589557 | 0.730697 | 2251 | 149 | -1580 | 1714 | -531 | 388 | 709 | 1494 | 464 | 284 | -694 | 4648 |
| 303 | Khris Middleton | 26 | SF | 82 | 82 | 2982 | 593 | 1272 | 146 | 407 | 447 | 865 | 320 | 362 | 429 | 328 | 119 | 21 | 191 | 1652 | MIL | 0.466195 | 0.358722 | 0.516763 | 0.523585 | 0.883978 | 1652 | 146 | -1272 | 1186 | -362 | 320 | 429 | 656 | 476 | 84 | -382 | 2933 |
| 25 | Andrew Wiggins | 22 | SF | 82 | 82 | 2979 | 569 | 1300 | 112 | 338 | 457 | 962 | 202 | 314 | 358 | 160 | 91 | 51 | 138 | 1452 | MIN | 0.437692 | 0.331361 | 0.475052 | 0.480769 | 0.643312 | 1452 | 112 | -1300 | 1138 | -314 | 202 | 358 | 320 | 364 | 204 | -276 | 2260 |
| 47 | Bradley Beal | 24 | SG | 82 | 82 | 2977 | 683 | 1484 | 199 | 530 | 484 | 954 | 292 | 369 | 363 | 373 | 96 | 36 | 214 | 1857 | WAS | 0.460243 | 0.375472 | 0.507338 | 0.527291 | 0.791328 | 1857 | 199 | -1484 | 1366 | -369 | 292 | 363 | 746 | 384 | 144 | -428 | 3070 |
| 278 | Jrue Holiday | 27 | SG | 81 | 81 | 2927 | 615 | 1244 | 120 | 356 | 495 | 888 | 187 | 238 | 365 | 486 | 123 | 64 | 213 | 1537 | NOP | 0.494373 | 0.337079 | 0.557432 | 0.542605 | 0.785714 | 1537 | 120 | -1244 | 1230 | -238 | 187 | 365 | 972 | 492 | 256 | -426 | 3251 |
In the rest of this section, we will leverage visualization! Throughout this tutorial, we will mainly use the Matplotlib and Seaborn libraries. If you are new to these libraries, feel free to learn more about them through the provided links, as they are incredibly valuable tools for data visualization.
First, we visualize the overall fantasy data across a different number of seasons using bar graphs.
We start by graphing average fantasy points per player position across all seasons. To clarify the position abbreviations: PG = point guard, SG = shooting guard, SF = small forward, PF = power forward, and C = center.
Why does this matter? According to ESPN, the default fantasy basketball roster includes one point guard (PG), one shooting guard (SG), one small forward (SF), one power forward (PF), one center (C), one guard (PG/SG), one forward (SF/PF), three utility spots of any position, and three bench spots of any position.
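To make this roster structure concrete, here is a minimal sketch (the slot labels are our own, not an official ESPN format) mapping each default slot to the positions eligible to fill it:

# Each roster slot mapped to the positions that can fill it (labels are illustrative)
DEFAULT_ROSTER_SLOTS = {
    'PG': ['PG'],
    'SG': ['SG'],
    'SF': ['SF'],
    'PF': ['PF'],
    'C': ['C'],
    'G': ['PG', 'SG'],
    'F': ['SF', 'PF'],
    'UTIL1': ['PG', 'SG', 'SF', 'PF', 'C'],
    'UTIL2': ['PG', 'SG', 'SF', 'PF', 'C'],
    'UTIL3': ['PG', 'SG', 'SF', 'PF', 'C'],
    'BE1': ['PG', 'SG', 'SF', 'PF', 'C'],
    'BE2': ['PG', 'SG', 'SF', 'PF', 'C'],
    'BE3': ['PG', 'SG', 'SF', 'PF', 'C'],
}

def eligible_slots(pos):
    # Return the roster slots a player with the given position could fill
    return [slot for slot, positions in DEFAULT_ROSTER_SLOTS.items() if pos in positions]

print(eligible_slots('C'))  # ['C', 'UTIL1', 'UTIL2', 'UTIL3', 'BE1', 'BE2', 'BE3']

With that structure in mind, we now compute the average fantasy points per position.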
combined_df = pd.concat(filtered_df_list, ignore_index=True)
average_fp_per_pos = combined_df.groupby('Pos')['FP_Total'].mean().reset_index()
average_fp_per_pos = average_fp_per_pos.sort_values('FP_Total', ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x='Pos', y='FP_Total', data=average_fp_per_pos, palette='viridis')
plt.title('Average Fantasy Points per Player Position (Overall)')
plt.xlabel('Player Position')
plt.ylabel('Average Fantasy Points')
plt.show()
As you can see, the average fantasy points for point guards (PG) across all seasons were the highest among all the other positions, while small forwards (SF) had the least average fantasy points. We think this is because in the earlier seasons, point guards had the ball the most, which caused them to generate the most statistics.
Next, we graph the average fantasy points per player position for the most recent season in our dataset.
last_season_dfs = filtered_df_list[-1:]
combined_df = pd.concat(last_season_dfs, ignore_index=True)
average_fp_per_pos = combined_df.groupby('Pos')['FP_Total'].mean().reset_index()
average_fp_per_pos = average_fp_per_pos.sort_values('FP_Total', ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x='Pos', y='FP_Total', data=average_fp_per_pos, palette='viridis')
plt.title('Average Fantasy Points per Player Position (Last 1 Year)')
plt.xlabel('Player Position')
plt.ylabel('Average Fantasy Points')
plt.show()
Here, we see that centers (C) had the highest average fantasy points in the most recent season, while small forwards (SF) still had the lowest. We think this is because modern game plans increasingly run playmaking and scoring through centers: centers have developed a skill set similar to point guards while remaining taller, which gives them an added advantage. In earlier seasons, centers were not as skilled at scoring and playmaking as point guards, so they did not generate as many fantasy points, which explains the difference between our two plots.
These trends are further analyzed in the website below: https://sportsanalytics.studentorg.berkeley.edu/articles/point-centers.html
Now, we visualize the relationship between percentage data and fantasy points using scatter plots.
Here, we try to determine if there is a relationship between field goal percentage and fantasy points for each player.
combined_df = pd.concat(filtered_df_list, ignore_index=True)
X = combined_df['FG%']
y = combined_df['FP_Total']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
intercept, slope = model.params
plt.figure(figsize=(10, 6))
sns.scatterplot(x='FG%', y='FP_Total', data=combined_df, s=100)
plt.plot(combined_df['FG%'], slope * combined_df['FG%'] + intercept, color='red')
plt.title('Effect of Field Goal Percentage on Fantasy Points')
plt.xlabel('Field Goal Percentage')
plt.ylabel('Fantasy Points')
plt.show()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: FP_Total R-squared: 0.072
Model: OLS Adj. R-squared: 0.072
Method: Least Squares F-statistic: 372.8
Date: Sat, 18 May 2024 Prob (F-statistic): 4.62e-80
Time: 23:35:36 Log-Likelihood: -38828.
No. Observations: 4800 AIC: 7.766e+04
Df Residuals: 4798 BIC: 7.767e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -154.7550 75.960 -2.037 0.042 -303.672 -5.838
FG% 3162.7308 163.799 19.309 0.000 2841.610 3483.852
==============================================================================
Omnibus: 447.779 Durbin-Watson: 0.336
Prob(Omnibus): 0.000 Jarque-Bera (JB): 583.826
Skew: 0.805 Prob(JB): 1.67e-127
Kurtosis: 3.572 Cond. No. 17.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
As you can see, there is a positive and statistically significant relationship between field goal percentage and fantasy points: the fitted line has a steep upward slope, and the FG% coefficient is highly significant. The higher a player's field goal percentage, the more fantasy points they tend to score, although the R-squared of about 0.07 shows that FG% alone explains only a small share of the variation.
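As a quick numerical check on these comparisons (a small sketch, not part of the original regressions), we can compute the Pearson correlation of each percentage column with FP_Total; for a one-variable regression, R-squared is simply the square of that correlation:

# Correlation of each shooting percentage with total fantasy points across all seasons
pct_cols = ['FG%', '3P%', '2P%', 'eFG%', 'FT%']
print(combined_df[pct_cols].corrwith(combined_df['FP_Total']).sort_values(ascending=False))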
Next, we try to determine if there is a relationship between three-point percentage and fantasy points for each player.
X = combined_df['3P%']
y = combined_df['FP_Total']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
intercept, slope = model.params
plt.figure(figsize=(10, 6))
sns.scatterplot(x='3P%', y='FP_Total', data=combined_df, s=100)
plt.plot(combined_df['3P%'], slope * combined_df['3P%'] + intercept, color='red')
plt.title('Effect of Three Point Percentage on Fantasy Points')
plt.xlabel('Three Point Percentage')
plt.ylabel('Fantasy Points')
plt.show()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: FP_Total R-squared: 0.012
Model: OLS Adj. R-squared: 0.012
Method: Least Squares F-statistic: 60.00
Date: Sat, 18 May 2024 Prob (F-statistic): 1.15e-14
Time: 23:35:37 Log-Likelihood: -38978.
No. Observations: 4800 AIC: 7.796e+04
Df Residuals: 4798 BIC: 7.797e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 1086.4137 29.422 36.926 0.000 1028.734 1144.094
3P% 687.2690 88.725 7.746 0.000 513.327 861.211
==============================================================================
Omnibus: 501.808 Durbin-Watson: 0.388
Prob(Omnibus): 0.000 Jarque-Bera (JB): 670.437
Skew: 0.876 Prob(JB): 2.61e-146
Kurtosis: 3.535 Cond. No. 8.26
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Here, we see that there is not a strong correlation between three-point percentage and fantasy points for each player as the linear regression line looks fairly horizontal. This goes to show that having a high three-point percentage does not necessarily indicate that a player will receive a high number of fantasy points. Therefore, this shows that the player’s overall ability to score is more important than scoring three-pointers.
Next, we try to determine if there is a relationship between effective field goal percentage and fantasy points for each player. Effective field goal percentage accounts for both two-point and three-point shots, giving three-pointers extra weight: eFG% = (FG + 0.5 * 3P) / FGA. For example, a player who makes 500 of 1,100 field goal attempts, 100 of them threes, has an eFG% of (500 + 0.5 * 100) / 1100 ≈ 0.50.
combined_df = pd.concat(filtered_df_list, ignore_index=True)
X = combined_df['eFG%']
y = combined_df['FP_Total']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
intercept, slope = model.params
plt.figure(figsize=(10, 6))
sns.scatterplot(x='eFG%', y='FP_Total', data=combined_df, s=100)
plt.plot(combined_df['eFG%'], slope * combined_df['eFG%'] + intercept, color='red')
plt.title('Effect of Effective Field Goal Percentage on Fantasy Points')
plt.xlabel('Effective Field Goal Percentage')
plt.ylabel('Fantasy Points')
plt.show()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: FP_Total R-squared: 0.068
Model: OLS Adj. R-squared: 0.068
Method: Least Squares F-statistic: 351.0
Date: Sat, 18 May 2024 Prob (F-statistic): 1.20e-75
Time: 23:35:38 Log-Likelihood: -38838.
No. Observations: 4800 AIC: 7.768e+04
Df Residuals: 4798 BIC: 7.769e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const -570.9918 100.267 -5.695 0.000 -767.561 -374.423
eFG% 3601.2584 192.215 18.736 0.000 3224.430 3978.087
==============================================================================
Omnibus: 490.114 Durbin-Watson: 0.383
Prob(Omnibus): 0.000 Jarque-Bera (JB): 653.471
Skew: 0.853 Prob(JB): 1.26e-142
Kurtosis: 3.599 Cond. No. 21.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Here, we see a positive and statistically significant relationship between effective field goal percentage and fantasy points: the fitted line is steep, and the eFG% coefficient is large and significant. The higher the effective field goal percentage, the more fantasy points a player tends to score, which suggests that players with more versatile, efficient scoring are rewarded with more fantasy points (again, the R-squared of about 0.07 means eFG% alone explains only part of the variation).
Lastly, we try to determine if there is a relationship between free throw percentage and fantasy points for each player.
combined_df = pd.concat(filtered_df_list, ignore_index=True)
X = combined_df['FT%']
y = combined_df['FP_Total']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
intercept, slope = model.params
plt.figure(figsize=(10, 6))
sns.scatterplot(x='FT%', y='FP_Total', data=combined_df, s=100)
plt.plot(combined_df['FT%'], slope * combined_df['FT%'] + intercept, color='red')
plt.title('Effect of Free Throw Percentage on Fantasy Points')
plt.xlabel('Free Throw Percentage')
plt.ylabel('Fantasy Points')
plt.show()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: FP_Total R-squared: 0.047
Model: OLS Adj. R-squared: 0.047
Method: Least Squares F-statistic: 235.1
Date: Sat, 18 May 2024 Prob (F-statistic): 7.53e-52
Time: 23:35:39 Log-Likelihood: -38893.
No. Observations: 4800 AIC: 7.779e+04
Df Residuals: 4798 BIC: 7.780e+04
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
const 71.8411 80.621 0.891 0.373 -86.212 229.894
FT% 1626.2832 106.056 15.334 0.000 1418.365 1834.201
==============================================================================
Omnibus: 473.797 Durbin-Watson: 0.450
Prob(Omnibus): 0.000 Jarque-Bera (JB): 625.994
Skew: 0.835 Prob(JB): 1.17e-136
Kurtosis: 3.582 Cond. No. 14.4
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Here, we can see a positive and statistically significant relationship between free throw percentage and fantasy points: the fitted line slopes upward and the FT% coefficient is significant. The higher the free throw percentage, the more fantasy points a player tends to score, although the R-squared of about 0.05 is again modest.
Now, we visualize the relationship between age and average fantasy points across all seasons using a bar graph.
combined_df = pd.concat(filtered_df_list, ignore_index=True)
average_fp_per_age = combined_df.groupby('Age')['FP_Total'].mean().reset_index()
average_fp_per_age = average_fp_per_age.sort_values('Age')
plt.figure(figsize=(10, 6))
sns.barplot(x='Age', y='FP_Total', data=average_fp_per_age, palette='viridis')
plt.title('Average Fantasy Points per Age')
plt.xlabel('Age')
plt.ylabel('Average Fantasy Points')
plt.show()
From this graph, we can see that players in their late 20s and early 30s generally accumulate the most fantasy points, while players in their late 30s and 40s generally accumulate the fewest.
Next, we visualize the relationship between a player who switched teams between seasons and average fantasy points using a bar graph.
last_season_df = filtered_df_list[-1]
second_last_season_df = filtered_df_list[-2]
merged_df = pd.merge(last_season_df, second_last_season_df, on='Player', suffixes=('_last', '_prev'))
merged_df['Team_Switch'] = merged_df['Tm_last'] != merged_df['Tm_prev']
switched_teams_df = merged_df[merged_df['Team_Switch']]
average_fp_last_season = switched_teams_df['FP_Total_last'].mean()
average_fp_prev_season = switched_teams_df['FP_Total_prev'].mean()
average_fp_diff = average_fp_last_season - average_fp_prev_season
plot_data = pd.DataFrame({
'Season': ['Previous Season', 'Last Season'],
'Average_FP': [average_fp_prev_season, average_fp_last_season]
})
plt.figure(figsize=(10, 6))
sns.barplot(x='Season', y='Average_FP', data=plot_data, palette='viridis')
plt.title('Average Fantasy Points for Players Who Switched Teams')
plt.xlabel('Season')
plt.ylabel('Average Fantasy Points')
plt.show()
From this graph, we can see that players who switched teams between seasons averaged fewer fantasy points in the season after the switch than in the season before. We think this is because players need time to adjust to their new team, which may hinder their performance.
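To check whether this drop is more than noise, a paired t-test could compare each switching player's totals in the two seasons. This is a sketch using scipy, which is not otherwise used in this tutorial:

# Hypothetical significance check: paired t-test of fantasy totals before vs. after a team switch
from scipy import stats

t_stat, p_value = stats.ttest_rel(switched_teams_df['FP_Total_prev'], switched_teams_df['FP_Total_last'])
print(f"t = {t_stat:.2f}, p = {p_value:.4f}")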
Now, we visualize the distribution of scoring metric weightages in total fantasy points using pie charts.
scoring_metrics = ['FP_PTS', 'FP_3P', 'FP_FG', 'FP_FT', 'FP_TRB', 'FP_AST', 'FP_STL', 'FP_BLK']
combined_df = pd.concat(filtered_df_list, ignore_index=True)
scoring_sums = combined_df[scoring_metrics].sum()
scoring_percentage = (scoring_sums / scoring_sums.sum()) * 100
plt.figure(figsize=(10, 8))
plt.pie(scoring_percentage, labels=scoring_percentage.index, autopct='%1.1f%%', startangle=140)
plt.title('Weightage of Positive Scoring Metrics in Total Fantasy Points')
plt.axis('equal')
plt.show()
This pie chart shows how the positive scoring metrics (the categories where players gain fantasy points, which is what we mean by positive) contribute, on average, to a player's total gained fantasy points.
scoring_metrics = ['FP_PTS', 'FP_3P', 'FP_FG', 'FP_FT', 'FP_TRB', 'FP_AST', 'FP_STL', 'FP_BLK']
combined_df = pd.concat(filtered_df_list, ignore_index=True)
positions = combined_df['Pos'].unique()
fig, axes = plt.subplots(2, len(positions) // 2 + len(positions) % 2, figsize=(20, 10), subplot_kw=dict(aspect="equal"))
axes = axes.flatten()
for i, pos in enumerate(positions):
    pos_df = combined_df[combined_df['Pos'] == pos]
    correlations = pos_df[scoring_metrics + ['FP_Total']].corr()['FP_Total'].drop('FP_Total')
    absolute_correlations = correlations.abs()
    wedges, texts, autotexts = axes[i].pie(absolute_correlations, autopct='%1.1f%%', startangle=140)
    axes[i].set_title(f'Influence on FP_Total for {pos}')
    axes[i].legend(wedges, absolute_correlations.index, title="Metrics", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))

for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
The above pie charts show, for each player position, the relative influence of each positive scoring metric on FP_Total, measured as the absolute correlation between that metric and the total. The distributions are fairly similar across positions.
combined_df = pd.concat(filtered_df_list, ignore_index=True)
negative_metrics = ['FP_FGA', 'FP_FTA', 'FP_TOV']
average_losses = combined_df[negative_metrics].mean().abs()
plt.figure(figsize=(10, 8))
wedges, texts, autotexts = plt.pie(average_losses, labels=average_losses.index, autopct='%1.1f%%', startangle=140)
plt.title('Weightage of Negative Scoring Metrics in Total Fantasy Points')
plt.axis('equal')
plt.legend(wedges, average_losses.index, title="Metrics", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
plt.show()
This pie chart shows how the negative scoring metrics (the categories where players lose fantasy points, which is what we mean by negative) contribute, on average, to a player's total lost fantasy points.
combined_df = pd.concat(filtered_df_list, ignore_index=True)
negative_metrics = ['FP_FGA', 'FP_FTA', 'FP_TOV']
positions = combined_df['Pos'].unique()
fig, axes = plt.subplots(2, len(positions) // 2 + len(positions) % 2, figsize=(20, 10), subplot_kw=dict(aspect="equal"))
axes = axes.flatten()
for i, pos in enumerate(positions):
    pos_df = combined_df[combined_df['Pos'] == pos]
    average_losses = pos_df[negative_metrics].mean().abs()
    wedges, texts, autotexts = axes[i].pie(average_losses, autopct='%1.1f%%', startangle=140)
    axes[i].set_title(f'Average Fantasy Points Loss for {pos}')
    axes[i].legend(wedges, average_losses.index, title="Metrics", loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))

for j in range(i + 1, len(axes)):
    fig.delaxes(axes[j])
plt.tight_layout()
plt.show()
The above pie charts show how the negative scoring metrics break down for each player position. Here we can see that centers lose a larger share of their fantasy points to free throw attempts, while point guards lose a larger share to turnovers.
Hypothesis Test regarding the relationship between Minutes and Games Played
We believe that volume is a large part of fantasy point scoring, so minutes and games played should be important factors.
Here are our null and alternative hypotheses:
H0: Minutes and games played have no relationship with fantasy point scoring.
H1: Minutes and games played have a relationship with fantasy point scoring.
We run a least squares regression to examine the effects of multiple variables on fantasy basketball scoring.
model = smf.ols(formula='FP_Total ~ Age + G + Q("3P%") + GS + Q("eFG%") + Q("FG%") + Q("FT%") + Q("2P%")', data=combined_df).fit()
print(model.summary())
OLS Regression Results
==============================================================================
Dep. Variable: FP_Total R-squared: 0.755
Model: OLS Adj. R-squared: 0.755
Method: Least Squares F-statistic: 1848.
Date: Sat, 18 May 2024 Prob (F-statistic): 0.00
Time: 23:35:47 Log-Likelihood: -35630.
No. Observations: 4800 AIC: 7.128e+04
Df Residuals: 4791 BIC: 7.134e+04
Df Model: 8
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -1717.6257 76.216 -22.536 0.000 -1867.044 -1568.207
Age 2.0903 1.396 1.497 0.134 -0.647 4.827
G 15.1789 0.410 37.040 0.000 14.375 15.982
Q("3P%") 619.8662 58.410 10.612 0.000 505.355 734.377
GS 17.3570 0.248 69.936 0.000 16.870 17.844
Q("eFG%") -1888.2648 226.474 -8.338 0.000 -2332.257 -1444.273
Q("FG%") 3000.0200 203.162 14.767 0.000 2601.730 3398.310
Q("FT%") 925.5394 61.534 15.041 0.000 804.904 1046.175
Q("2P%") 494.2475 158.791 3.113 0.002 182.944 805.551
==============================================================================
Omnibus: 616.065 Durbin-Watson: 1.751
Prob(Omnibus): 0.000 Jarque-Bera (JB): 1598.945
Skew: 0.719 Prob(JB): 0.00
Kurtosis: 5.435 Cond. No. 3.57e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.57e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
features = ['Age', 'G', 'GS', 'MP', '3P%', 'FT%', 'FG%','2P%', 'eFG%']
target = 'FP_Total'
X = combined_df[features]
y = combined_df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print("Coefficients:", model.coef_)
print("Intercept:", model.intercept_)
Mean Squared Error: 69486.39354592712
Coefficients: [ 1.96775973e+00 -9.92737027e+00 5.12621897e-01 1.18749422e+00 2.50653080e+02 3.46327930e+02 2.93248928e+03 4.91634053e+02 -1.61604972e+03]
Intercept: -955.7387567713192
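The raw mean squared error is hard to interpret on its own. Since r2_score is already imported, a small addition (not in the original notebook) can report the share of variance the linear model explains on the held-out split:

# Proportion of variance in FP_Total explained by the linear model on the test split
print("R-squared on test set:", r2_score(y_test, y_pred))

Next, the same features are fit with a gradient descent implementation written from scratch on standardized inputs, as a sanity check on the closed-form fit.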
X = combined_df[['Age', 'G', 'GS', 'MP', '3P%', 'FT%', 'FG%','2P%', 'eFG%']].values
y = combined_df['FP_Total'].values
X = (X - X.mean(axis=0)) / X.std(axis=0)
X = np.c_[np.ones(X.shape[0]), X]
theta = np.zeros(X.shape[1])
iterations = 1000
alpha = 0.01
def hypothesis(X, theta):
    return np.dot(X, theta)

def cost_function(X, y, theta):
    m = len(y)
    return (1 / (2 * m)) * np.sum((hypothesis(X, theta) - y) ** 2)

def gradient_descent(X, y, theta, alpha, iterations):
    m = len(y)
    cost_history = np.zeros(iterations)
    for iteration in range(iterations):
        error = hypothesis(X, theta) - y
        gradient = (1 / m) * np.dot(X.T, error)
        theta -= alpha * gradient
        cost_history[iteration] = cost_function(X, y, theta)
    return theta, cost_history
final_theta, cost_history = gradient_descent(X, y, theta, alpha, iterations)
print("Final theta:", final_theta)
plt.plot(range(iterations), cost_history)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.title('Cost vs. Iterations')
plt.show()
Final theta: [1295.30949413 5.42380823 -21.4595378 181.03334471 562.84756762 28.98779362 51.41053839 148.42640373 35.58089283 -52.91286839]
X = combined_df[['Age', 'G', 'GS', 'MP', '3P%', 'FT%', 'FG%', '2P%', 'eFG%']]
filter_condition = (X['Age'] != 0) & (X['G'] != 0) & (X['GS'] != 0) & (X['MP'] != 0) & (X['3P%'] != 0) & (X['FT%'] != 0) & (X['FG%'] != 0) & (X['2P%'] != 0) & (X['eFG%'] != 0)
X = X[filter_condition]
y = combined_df.loc[filter_condition, 'FP_Total']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
model_decisiontree = DecisionTreeRegressor(random_state=50)
model_decisiontree.fit(X_train, y_train)
DecisionTreeRegressor(random_state=50)
y_pred = model_decisiontree.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)
MSE: 157483.8811369509
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=1)
model_randomforest = RandomForestRegressor(random_state=1)
model_randomforest.fit(X_train, y_train)
y_pred = model_randomforest.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE) for Random Forest:", mse)
feature_importance = model_randomforest.feature_importances_
sorted_idx = np.argsort(feature_importance)
plt.figure(figsize=(10, 6))
plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx], align='center')
plt.yticks(range(len(sorted_idx)), np.array(X.columns)[sorted_idx])
plt.xlabel('Feature Importance')
plt.title('Random Forest Feature Importance')
plt.show()
Mean Squared Error (MSE) for Random Forest: 61888.61990077519
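The random forest improves on both the single decision tree and the linear model in terms of MSE. Since GridSearchCV is imported above but never used, here is a minimal sketch (the grid values are illustrative choices, not tuned results from the original analysis) of how the forest could be tuned further:

# Hypothetical hyperparameter search for the random forest; grid values are illustrative only
param_grid = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [1, 5],
}
search = GridSearchCV(
    RandomForestRegressor(random_state=1),
    param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
)
search.fit(X_train, y_train)
print("Best parameters:", search.best_params_)
print("Best CV MSE:", -search.best_score_)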
The purpose of this analysis was to test the hypothesis regarding the relationship between minutes played, games played, and fantasy point scoring in fantasy basketball. We posited that volume metrics, such as minutes and games played, are significant factors in determining fantasy points scored by a player.
Hypotheses: Null Hypothesis (H0): There is no relationship between minutes played and games played on fantasy point scoring. Alternate Hypothesis (H1): There is a relationship between minutes played and games played on fantasy point scoring. To investigate this, we conducted a least squares regression analysis to assess the impact of multiple variables, including minutes played and games played, on fantasy basketball scoring.
Key Findings: Regression Analysis: The results from the least squares regression provided insights into the significance of various predictors on fantasy points.
Significance of Variables: Both minutes played and games played were found to be significant predictors of fantasy point scoring. This indicates that players who spend more time on the court and participate in more games tend to accumulate higher fantasy points. Volume Metrics: The positive coefficients for minutes played and games played suggest that as these variables increase, the fantasy points scored by a player also increase. This supports the notion that volume is a critical aspect of fantasy point scoring.
Conclusion: Based on the regression analysis, we reject the null hypothesis (H0) in favor of the alternative hypothesis (H1): there is a statistically significant relationship between minutes played, games played, and fantasy point scoring. This finding supports our belief that volume metrics, such as the amount of time a player is on the court and the number of games they play, are important factors in determining fantasy basketball performance.
Implications: For fantasy basketball managers, this analysis highlights the importance of considering players' minutes and games played when making roster decisions. Players who consistently play more minutes and participate in more games are likely to be more valuable in terms of fantasy points, making them strategic picks for a successful fantasy basketball season.
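As a closing illustration (a sketch built on the random forest above, not part of the original analysis), we can rank players by their predicted season fantasy totals to see who the model would prioritize when filling out a roster:

# Predict season fantasy totals for every filtered player and list the top 10.
# Note: this scores rows the model was partly trained on, so it is illustrative only.
ranked = combined_df.loc[filter_condition].copy()
ranked['Predicted_FP'] = model_randomforest.predict(X)
print(ranked.sort_values('Predicted_FP', ascending=False)[['Player', 'Pos', 'Tm', 'Predicted_FP']].head(10))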