# just our good old libraries

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# nothing to see here
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

# Load match-level batting performance data (includes fantasy points)
batters_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/CMSC320/final project/Batting_data.csv")

# Preview the first few rows to confirm structure
display(batters_df.head())

# Check the shape of the dataset (rows × columns)
batters_df.shape

(15714, 17)

# Check if there are any missing (NaN) values in the DataFrame
has_missing = batters_df.isna().any().any()

# Display rows with missing values, if any
if has_missing:
    display(batters_df[batters_df.isna().any(axis=1)])
else:
    print("No missing values found in the dataset. Ready for analysis!")

No missing values found in the dataset. Ready for analysis!

# Display structure: data types, null values, column names
print(batters_df.info())

# Summary statistics for numeric columns (like runs, balls, Batting_FP)
print(batters_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15714 entries, 0 to 15713
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   match_id          15714 non-null  int64  
 1   season            15714 non-null  int64  
 2   match_name        15714 non-null  object 
 3   home_team         15714 non-null  object 
 4   away_team         15714 non-null  object 
 5   venue             15714 non-null  object 
 6   bowling_team      15714 non-null  object 
 7   batting_team      15714 non-null  object 
 8   batting_innings   15714 non-null  int64  
 9   fullName          15714 non-null  object 
 10  batting_position  15714 non-null  int64  
 11  runs              15714 non-null  int64  
 12  balls             15714 non-null  int64  
 13  fours             15714 non-null  int64  
 14  sixes             15714 non-null  int64  
 15  strike_rate       15714 non-null  float64
 16  Batting_FP        15714 non-null  int64  
dtypes: float64(1), int64(9), object(7)
memory usage: 2.0+ MB
None
           match_id        season  batting_innings  batting_position  \
count  1.571400e+04  15714.000000     15714.000000      15714.000000   
mean   8.703602e+05   2015.607038         1.490327          4.688303   
std    3.544485e+05      4.673169         0.499922          2.693836   
min    3.359820e+05   2008.000000         1.000000          1.000000   
25%    5.483120e+05   2012.000000         1.000000          2.000000   
50%    8.298190e+05   2015.000000         1.000000          4.000000   
75%    1.216506e+06   2020.000000         2.000000          7.000000   
max    1.370353e+06   2023.000000         2.000000         11.000000   

               runs         balls         fours         sixes   strike_rate  \
count  15714.000000  15714.000000  15714.000000  15714.000000  15714.000000   
mean      19.418417     15.019028      1.758814      0.748823    109.304781   
std       21.218681     13.592456      2.296097      1.331647     68.563722   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        3.000000      4.000000      0.000000      0.000000     66.660000   
50%       12.000000     11.000000      1.000000      0.000000    108.510000   
75%       28.000000     22.000000      3.000000      1.000000    150.000000   
max      175.000000     73.000000     19.000000     17.000000    600.000000   

         Batting_FP  
count  15714.000000  
mean      24.813733  
std       29.338779  
min       -8.000000  
25%        3.000000  
50%       14.000000  
75%       37.000000  
max      244.000000

# Load match-level bowling data (includes fantasy points and detailed dismissal stats)
bowlers_df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/CMSC320/final project/Bowling_data.csv")

# Preview a few rows to confirm structure and included metrics
bowlers_df.head()

# Display basic info about column types and null values
print(bowlers_df.info())

# Show summary statistics for numerical columns (e.g. wickets, economy, Bowling_FP)
print(bowlers_df.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12111 entries, 0 to 12110
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   season           12111 non-null  int64  
 1   match_id         12111 non-null  int64  
 2   match_name       12111 non-null  object 
 3   home_team        12111 non-null  object 
 4   away_team        12111 non-null  object 
 5   batting_team     12111 non-null  object 
 6   bowling_team     12111 non-null  object 
 7   venue            12111 non-null  object 
 8   bowling_innings  12111 non-null  int64  
 9   fullName         12111 non-null  object 
 10  overs            12111 non-null  float64
 11  total_balls      12111 non-null  int64  
 12  dots             12111 non-null  int64  
 13  maidens          12111 non-null  int64  
 14  conceded         12111 non-null  int64  
 15  foursConceded    12111 non-null  int64  
 16  sixesConceded    12111 non-null  int64  
 17  wickets          12111 non-null  int64  
 18  economyRate      12111 non-null  float64
 19  wides            12111 non-null  int64  
 20  noballs          12111 non-null  int64  
 21  LBW              12111 non-null  int64  
 22  Hitwicket        12111 non-null  int64  
 23  CaughtBowled     12111 non-null  int64  
 24  Bowled           12111 non-null  int64  
 25  Overs_Bowled     12111 non-null  object 
 26  Bowling_FP       12111 non-null  int64  
dtypes: float64(2), int64(17), object(8)
memory usage: 2.5+ MB
None
             season      match_id  bowling_innings         overs  \
count  12111.000000  1.211100e+04     12111.000000  12111.000000   
mean    2015.541078  8.659673e+05         1.497977      3.222541   
std        4.655261  3.535763e+05         0.500017      1.023971   
min     2008.000000  3.359820e+05         1.000000      0.000000   
25%     2012.000000  5.483080e+05         1.000000      3.000000   
50%     2015.000000  8.298090e+05         1.000000      4.000000   
75%     2020.000000  1.216502e+06         2.000000      4.000000   
max     2023.000000  1.370353e+06         2.000000      4.000000   

        total_balls          dots       maidens      conceded  foursConceded  \
count  12111.000000  12111.000000  12111.000000  12111.000000   12111.000000   
mean      19.401040      7.382132      0.027661     26.039799       2.281562   
std        6.117762      3.853098      0.165010     10.769290       1.669463   
min        0.000000      0.000000      0.000000      0.000000       0.000000   
25%       18.000000      4.000000      0.000000     19.000000       1.000000   
50%       24.000000      7.000000      0.000000     26.000000       2.000000   
75%       24.000000     10.000000      0.000000     33.000000       3.000000   
max       24.000000     20.000000      2.000000     70.000000      11.000000   

       sixesConceded       wickets   economyRate         wides       noballs  \
count   12111.000000  12111.000000  12111.000000  12111.000000  12111.000000   
mean        0.971761      0.905293      8.364614      0.631327      0.083726   
std         1.093496      0.995712      3.153501      0.920313      0.319618   
min         0.000000      0.000000      0.000000      0.000000      0.000000   
25%         0.000000      0.000000      6.250000      0.000000      0.000000   
50%         1.000000      1.000000      8.000000      0.000000      0.000000   
75%         2.000000      1.000000     10.000000      1.000000      0.000000   
max         8.000000      6.000000     36.000000      7.000000      4.000000   

                LBW     Hitwicket  CaughtBowled        Bowled    Bowling_FP  
count  12111.000000  12111.000000  12111.000000  12111.000000  12111.000000  
mean       0.059450      0.001239      0.027248      0.166378     25.841962  
std        0.247397      0.035173      0.164326      0.425231     29.916351  
min        0.000000      0.000000      0.000000      0.000000     -6.000000  
25%        0.000000      0.000000      0.000000      0.000000      0.000000  
50%        0.000000      0.000000      0.000000      0.000000     25.000000  
75%        0.000000      0.000000      0.000000      0.000000     39.000000  
max        3.000000      1.000000      2.000000      4.000000    216.000000

india_venues = ['Narendra Modi Stadium, Motera, Ahmedabad',
 'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh',
 'Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow',
 'Rajiv Gandhi International Stadium, Uppal, Hyderabad',
 'M.Chinnaswamy Stadium, Bengaluru',
 'MA Chidambaram Stadium, Chepauk, Chennai', 'Arun Jaitley Stadium, Delhi',
 'Barsapara Cricket Stadium, Guwahati', 'Eden Gardens, Kolkata',
 'Wankhede Stadium, Mumbai', 'Sawai Mansingh Stadium, Jaipur'
 'Himachal Pradesh Cricket Association Stadium, Dharamsala',
 'Brabourne Stadium, Mumbai', 'Dr DY Patil Sports Academy, Navi Mumbai',
 'Maharashtra Cricket Association Stadium, Pune',
 'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam',
 'Holkar Cricket Stadium, Indore',
 'Saurashtra Cricket Association Stadium, Rajkot', 'Green Park, Kanpur',
 'Shaheed Veer Narayan Singh International Stadium, Raipur',
 'Sardar Patel (Gujarat) Stadium, Motera, Ahmedabad',
 'JSCA International Stadium Complex, Ranchi', 'Barabati Stadium, Cuttack',
 'Nehru Stadium, Kochi', 'Dr DY Patil Sports Academy, Mumbai',
 'Vidarbha Cricket Association Stadium, Jamtha, Nagpur']

active_teams = ['CSK', 'GT', 'PBKS', 'KKR', 'LSG', 'DC', 'RR', 'SRH', 'MI', 'RCB']

# Filter only Indian venues
batters_df = batters_df[batters_df['venue'].isin(india_venues)].copy()
bowlers_df = bowlers_df[bowlers_df['venue'].isin(india_venues)].copy()

# Standardize team name: GL → GT
batters_df.loc[batters_df['batting_team'] == 'GL', 'batting_team'] = 'GT'
bowlers_df.loc[bowlers_df['bowling_team'] == 'GL', 'bowling_team'] = 'GT'

# Remove performances by inactive teams
batters_df = batters_df[batters_df['batting_team'].isin(active_teams)].copy()
bowlers_df = bowlers_df[bowlers_df['bowling_team'].isin(active_teams)].copy()

# put this in data cleaning section on top

print(len(bowlers_df["match_id"].unique()))
print(len(batters_df["match_id"].unique()))

set(batters_df["match_id"].unique()) - set(bowlers_df["match_id"].unique())
condition = batters_df['match_id'] == 501265
rows_to_drop = batters_df[condition].index
batters_df = batters_df.drop(rows_to_drop)

791
792

# Count the number of matches played at each venue (after dropping duplicates)
unique_matches = batters_df[['match_id', 'venue']].drop_duplicates()
venue_counts = unique_matches['venue'].value_counts()

# Visualize match frequency by venue
plt.figure(figsize=(12, 6))
sns.barplot(x=venue_counts.index, y=venue_counts.values, palette='viridis')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Venue')
plt.ylabel('Number of Matches')
plt.title('Match Counts by Venue')
plt.tight_layout()
plt.show()

# Plot histogram of batting fantasy points (Batting_FP)
sns.histplot(batters_df['Batting_FP'], bins=30)
plt.xlabel('Batting_FP')
plt.ylabel('Count')
plt.title('Distribution of Batting Fantasy Points')
plt.show()

# Check batters with 0 balls faced — likely didn’t get to bat
bat_temp = batters_df[batters_df["balls"] == 0]
bat_temp.shape

# Check bowlers with 0 overs bowled — likely didn’t bowl
bowl_temp = bowlers_df[bowlers_df["overs"] == 0]
bowl_temp.shape

# Drop those rows (very few)
batters_df = batters_df.drop(bat_temp.index)
bowlers_df = bowlers_df.drop(bowl_temp.index)

# Sum total batting and bowling fantasy points per match

match_points_bat = pd.Series(batters_df.groupby("match_id")["Batting_FP"].sum())
match_points_bowl = pd.Series(bowlers_df.groupby("match_id")["Bowling_FP"].sum())

# Histogram of total batting points per match
match_points_bat.hist()
plt.title("Histogram of batting points per match")
plt.xlabel("batting points")
plt.ylabel("Frequency")
plt.show()

# Histogram of total bowling points per match
match_points_bowl.hist()
plt.title("Histogram of bowling points per match")
plt.xlabel("bowling points")
plt.ylabel("Frequency")
plt.show()

# Group by venue and compute average batting fantasy points


mean_fp = batters_df.groupby('venue')['Batting_FP'].mean().sort_values()

# Plot average fantasy points by venue (batting)
plt.figure(figsize=(10,6))
sns.barplot(x=mean_fp.index, y=mean_fp.values)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Average Batting_FP')
plt.title('Average Batting Fantasy Points by Venue')
plt.tight_layout()
plt.show()

# Group by venue and compute average bowling fantasy points
mean_fp = bowlers_df.groupby('venue')['Bowling_FP'].mean().sort_values()

# Plot average fantasy points by venue (bowling)
plt.figure(figsize=(10,6))
sns.barplot(x=mean_fp.index, y=mean_fp.values)
plt.xticks(rotation=45, ha='right')
plt.ylabel('Average Bowling')
plt.title('Average Bowling Fantasy Points by Venue')
plt.tight_layout()
plt.show()

# creating a new feature called "at_home" to check if home ground advantage is real

team_home_venues = {
    'RCB': ['M.Chinnaswamy Stadium, Bengaluru'],
    'PBKS': [
        'Punjab Cricket Association IS Bindra Stadium, Mohali, Chandigarh',
        'Himachal Pradesh Cricket Association Stadium, Dharamsala'
    ],
    'DC': [
        'Arun Jaitley Stadium, Delhi',
        'Dr. Y.S. Rajasekhara Reddy ACA-VDCA Cricket Stadium, Visakhapatnam'
    ],
    'MI': [
        'Wankhede Stadium, Mumbai',
        'Dr DY Patil Sports Academy, Mumbai',
        'Brabourne Stadium, Mumbai'
    ],
    'KKR': ['Eden Gardens, Kolkata'],
    'RR': [
        'Sawai Mansingh Stadium, Jaipur',
        'Barsapara Cricket Stadium, Guwahati'
    ],
    'SRH': ['Rajiv Gandhi International Stadium, Uppal, Hyderabad'],
    'CSK': ['MA Chidambaram Stadium, Chepauk, Chennai'],
    'Kochi': ['Nehru Stadium, Kochi'],
    'PWI': ['Maharashtra Cricket Association Stadium, Pune'],
    'RPS': ['Maharashtra Cricket Association Stadium, Pune'],
    'GT': ['Narendra Modi Stadium, Motera, Ahmedabad', 'Saurashtra Cricket Association Stadium, Rajkot'],
    'LSG': ['Bharat Ratna Shri Atal Bihari Vajpayee Ekana Cricket Stadium, Lucknow']
}

neutral_venues = [
    'Barabati Stadium, Cuttack',
    'Vidarbha Cricket Association Stadium, Jamtha, Nagpur',
    'Shaheed Veer Narayan Singh International Stadium, Raipur',
    'JSCA International Stadium Complex, Ranchi',
    'Green Park, Kanpur'
]

# add feature to batters_df

def decipher_env_batters(row):
  if row["batting_team"] == row["home_team"]:
    return "home"
  else:
    return "away"

batters_df["env"] = batters_df.apply(decipher_env_batters, axis=1)

# similarly for bowlers_df

def decipher_env_batters(row):
  if row["bowling_team"] == row["home_team"]:
    return "home"
  else:
    return "away"

bowlers_df["env"] = bowlers_df.apply(decipher_env_batters, axis=1)

from scipy.stats import ttest_ind

# for batters:

# null hypothesis: there is no difference in the mean Batting_FP between home and away games
# alternate hypothesis: there is a difference in the mean Batting_FP between home and away games

home_scores = batters_df[batters_df['env'] == 'home']['Batting_FP']
away_scores = batters_df[batters_df['env'] == 'away']['Batting_FP']

t_stat, p_val = ttest_ind(home_scores, away_scores, equal_var=False)

print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_val:.4f}")

T-statistic: 1.76
P-value: 0.0789

# for bowlers:

# null hypothesis: there is no difference in the mean Bowling_FP between home and away games
# alternate hypothesis: there is a difference in the mean Bowling_FP between home and away games

home_scores = bowlers_df[bowlers_df['env'] == 'home']['Bowling_FP']
away_scores = bowlers_df[bowlers_df['env'] == 'away']['Bowling_FP']

t_stat, p_val = ttest_ind(home_scores, away_scores, equal_var=False)

print(f"T-statistic: {t_stat:.2f}")
print(f"P-value: {p_val:.4f}")

T-statistic: 2.13
P-value: 0.0328

# home game or away game helps in prediction thus adding a feature in both datasets
batters_df['is_home'] = (batters_df['env'] == 'home').astype(int)
bowlers_df['is_home'] = (bowlers_df['env'] == 'home').astype(int)

# Compute correlations only between numeric features
numeric_batters_df = batters_df.select_dtypes(include='number')
correlations_batters = numeric_batters_df.corr()

# Plot the correlation heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlations_batters, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap (Numeric Features Only)')
plt.show()

# Compute N-match rolling average (excluding current match) for a player’s metric
# Impute either using career or global average if rolling window not available

def apply_rolling_avg(df, numeric_field, window_size=5):
    df = df.sort_values(by=['fullName', 'match_id'])

    rolling_avg_list = []

    for name, group in df.groupby(by=['fullName']):
        rolled = group[numeric_field].shift(1).rolling(window=window_size, min_periods=1).mean()
        rolling_avg_list.append(rolled)

    rolling_column_name = f"{numeric_field}_rolling{window_size}"
    df[rolling_column_name] = pd.concat(rolling_avg_list).sort_index()

    career_avg = df.groupby(by=['fullName'])[numeric_field].transform('mean')
    global_avg = df[numeric_field].mean()

    df[rolling_column_name] = (
        df[rolling_column_name]
        .fillna(career_avg)
        .fillna(global_avg)
    )

    return df

# window size = 5

batters_df = apply_rolling_avg(batters_df, 'Batting_FP', window_size=5)
batters_df = apply_rolling_avg(batters_df, 'runs', window_size=5)
batters_df = apply_rolling_avg(batters_df, 'balls', window_size=5)
batters_df = apply_rolling_avg(batters_df, 'fours', window_size=5)
batters_df = apply_rolling_avg(batters_df, 'sixes', window_size=5)
batters_df = apply_rolling_avg(batters_df, 'strike_rate', window_size=5)

bowlers_df = apply_rolling_avg(bowlers_df, 'Bowling_FP', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'overs', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'total_balls', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'dots', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'maidens', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'conceded', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'foursConceded', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'sixesConceded', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'wickets', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'economyRate', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'wides', window_size=5)
bowlers_df = apply_rolling_avg(bowlers_df, 'noballs', window_size=5)

# window size = 10

batters_df = apply_rolling_avg(batters_df, 'Batting_FP', window_size=10)
batters_df = apply_rolling_avg(batters_df, 'runs', window_size=10)
batters_df = apply_rolling_avg(batters_df, 'balls', window_size=10)
batters_df = apply_rolling_avg(batters_df, 'fours', window_size=10)
batters_df = apply_rolling_avg(batters_df, 'sixes', window_size=10)
batters_df = apply_rolling_avg(batters_df, 'strike_rate', window_size=10)

bowlers_df = apply_rolling_avg(bowlers_df, 'Bowling_FP', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'overs', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'total_balls', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'dots', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'maidens', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'conceded', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'foursConceded', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'sixesConceded', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'wickets', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'economyRate', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'wides', window_size=10)
bowlers_df = apply_rolling_avg(bowlers_df, 'noballs', window_size=10)

def add_career_stats(df, cols):
    df = df.sort_values(by=['fullName', 'match_id'])

    for col in cols:
        df[f'career_{col}_mean'] = (
            df.groupby('fullName')[col]
            .transform(lambda x: x.shift(1).expanding().mean())
        )

        global_avg = df[col].mean()
        df[f'career_{col}_mean'] = df[f'career_{col}_mean'].fillna(global_avg)

    return df

batters_col = [
    'Batting_FP',
    'strike_rate',
    'balls',
    'runs',
    'fours',
    'sixes'
]

batters_df = add_career_stats(batters_df, batters_col)

bowlers_col = [
    'Bowling_FP',
    'overs',
    'total_balls',
    'dots',
    'conceded',
    'foursConceded',
    'sixesConceded',
    'wickets',
    'economyRate',
    'wides',
    'noballs',
]

bowlers_df = add_career_stats(bowlers_df, bowlers_col)

bowlers_df = bowlers_df.sort_values(by=['fullName', 'match_id'])

bowlers_df['career_maidens_sum'] = (
    bowlers_df.groupby('fullName')['maidens']
    .transform(lambda x: x.shift(1).cumsum())
    .fillna(0)
)

from sklearn.preprocessing import LabelEncoder

# Define relevant batting features for correlation analysis
columns_of_interest = ['batting_position', 'Batting_FP_rolling5', 'runs_rolling5', 'balls_rolling5',
                       'fours_rolling5', 'sixes_rolling5', 'strike_rate_rolling5', 'is_home', 'career_Batting_FP_mean', 'career_strike_rate_mean', 'career_balls_mean',
                       'career_runs_mean', 'career_fours_mean', 'career_sixes_mean', 'Batting_FP', 'Batting_FP_rolling10', 'runs_rolling10', 'balls_rolling10',
                       'fours_rolling10', 'sixes_rolling10', 'strike_rate_rolling10']

# Calculate correlation with Batting_FP and sort by absolute strength
correlations = batters_df[columns_of_interest].corr()['Batting_FP'].drop("Batting_FP").sort_values(key=abs, ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=correlations.values, y=correlations.index, palette='coolwarm')
plt.title('Correlation of Selected Features with Batting Fantasy Points (Batting_FP)')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Feature')
plt.axvline(0, color='gray', linestyle='--')
plt.tight_layout()
plt.show()

# Define bowling-related features for correlation analysis
columns_of_interest =['Bowling_FP', 'Bowling_FP_rolling5','overs_rolling5','total_balls_rolling5','dots_rolling5','maidens_rolling5','conceded_rolling5','foursConceded_rolling5',
                      'sixesConceded_rolling5','wickets_rolling5','economyRate_rolling5','wides_rolling5','noballs_rolling5','career_Bowling_FP_mean','career_overs_mean','career_total_balls_mean',
                      'career_dots_mean','career_conceded_mean','career_foursConceded_mean','career_sixesConceded_mean','career_wickets_mean','career_economyRate_mean','career_wides_mean',
                      'career_noballs_mean','career_maidens_sum','is_home']

# Compute correlation values and sort by absolute impact
correlations = bowlers_df[columns_of_interest].corr()['Bowling_FP'].drop("Bowling_FP").sort_values(key=abs, ascending=False)

plt.figure(figsize=(10, 6))
sns.barplot(x=correlations.values, y=correlations.index, palette='coolwarm')
plt.title('Correlation of Selected Features with Bowling Fantasy Points (Bowling_FP)')
plt.xlabel('Correlation Coefficient')
plt.ylabel('Feature')
plt.axvline(0, color='gray', linestyle='--')
plt.tight_layout()
plt.show()

# Define batting features based on rolling averages, career stats, and role/context
batting_features = ['batting_position', 'Batting_FP_rolling5', 'runs_rolling5', 'balls_rolling5',
                       'fours_rolling5', 'sixes_rolling5', 'strike_rate_rolling5', 'is_home', 'career_Batting_FP_mean', 'career_strike_rate_mean', 'career_balls_mean',
                       'career_runs_mean', 'career_fours_mean', 'career_sixes_mean', 'Batting_FP_rolling10', 'runs_rolling10', 'balls_rolling10',
                       'fours_rolling10', 'sixes_rolling10', 'strike_rate_rolling10']

batting_target = "Batting_FP"

from sklearn.model_selection import train_test_split

X = batters_df[batting_features]
y = batters_df[batting_target]
# Split data into train and test sets (80/20 split, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
import numpy as np

# Train multiple regression models and evaluate their performance
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Random Forest": RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
}

# Fit and evaluate each model

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate using MAE, RMSE, and R²
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)

    print(f"--- {name} ---")
    print(f"MAE : {mae:.2f}")
    print(f"RMSE: {rmse:.2f}")
    print(f"R²  : {r2:.2f}")
    print()

--- Linear Regression ---
MAE : 20.63
RMSE: 27.86
R²  : 0.12

--- Ridge Regression ---
MAE : 20.63
RMSE: 27.86
R²  : 0.12

--- Random Forest ---
MAE : 20.95
RMSE: 28.24
R²  : 0.09

--- XGBoost ---
MAE : 21.02
RMSE: 28.67
R²  : 0.07

# Extract and plot XGBoost feature importances
importances = model.feature_importances_
feature_names = X_train.columns

importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': importances
}).sort_values(by='importance', ascending=False)

# Visualize the most important features influencing Batting_FP
plt.figure(figsize=(10, 6))
sns.barplot(data=importance_df, x='importance', y='feature', palette='viridis')
plt.title('XGBoost Feature Importances')
plt.tight_layout()
plt.show()

# Select relevant features for predicting Bowling_FP
bowling_features = ['Bowling_FP_rolling5','overs_rolling5','total_balls_rolling5','dots_rolling5','maidens_rolling5','conceded_rolling5','foursConceded_rolling5',
                      'sixesConceded_rolling5','wickets_rolling5','economyRate_rolling5','wides_rolling5','noballs_rolling5','career_Bowling_FP_mean','career_overs_mean','career_total_balls_mean',
                      'career_dots_mean','career_conceded_mean','career_foursConceded_mean','career_sixesConceded_mean','career_wickets_mean','career_economyRate_mean','career_wides_mean',
                      'career_noballs_mean','career_maidens_sum','is_home']

bowling_target = "Bowling_FP"

# Split data into train and test sets
X = bowlers_df[bowling_features]
y = bowlers_df[bowling_target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate XGBoost
model = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=5, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MAE: 23.939470291137695
R² Score: -0.02789008617401123

from sklearn.ensemble import RandomForestRegressor

# Train and evaluate Random Forest
model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print("MAE:", mean_absolute_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

MAE: 23.253678022468044
R² Score: 0.010240976981151784

# Calculates how often the model predicts at least 'threshold' players correctly in each match
def match_accuracy_threshold_report(correct_series, threshold):
    total_matches = len(correct_series)
    successful_matches = (correct_series >= threshold).sum()
    percentage = (successful_matches / total_matches) * 100

    print(f"The model predicted at least {threshold} players correctly in {percentage:.2f}% of matches.")
    return percentage

# Main pipeline: trains model, evaluates with standard metrics, and analyzes match-level accuracy
def fantasy_model_pipeline(df, model_name, model, features, X_train, Y_train, X_test, Y_test, percent_correct):

  print(f"Model: {model_name}")

  model.fit(X_train, Y_train)
  y_pred = model.predict(X_test)
  y_proba = model.predict_proba(X_test)[:, 1]

  print("Accuracy:", accuracy_score(Y_test, y_pred))
  print("Precision:", precision_score(Y_test, y_pred))
  print("Recall:", recall_score(Y_test, y_pred))
  print("F1 Score:", f1_score(Y_test, y_pred))
  print("ROC AUC:", roc_auc_score(Y_test, y_proba))

  cm = confusion_matrix(Y_test, y_pred)
  sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
  plt.xlabel("Predicted")
  plt.ylabel("Actual")
  plt.title("Confusion Matrix")
  plt.show()

  df[f'predicted_target_by_{model_name}'] = model.predict(df[features])

  matches = df.groupby("match_id")

  corrects = []
  for match_id, rows in matches:
    sum = (rows["target"] == rows[f'predicted_target_by_{model_name}']).sum()
    corrects.append(sum)

  correct_series = pd.Series(corrects)

  thresholds = list(range(1, 12))
  percentages = [match_accuracy_threshold_report(correct_series, t) for t in thresholds]
  percent_correct[model_name] = percentages[6]

  plt.figure(figsize=(10, 5))
  plt.plot(thresholds, percentages, marker='o')
  plt.title("Match Accuracy: % of Matches with ≥ X Correct Predictions")
  plt.xlabel("Correct Predictions Threshold (Players)")
  plt.ylabel("Percentage of Matches")
  plt.grid(True)
  plt.show()

# Print averages for potential thresholds to help inform classification label boundary
print(batters_df[batters_df["Batting_FP"]>10]["Batting_FP"].mean())
print(batters_df[batters_df["Batting_FP"]>20]["Batting_FP"].mean())
print(batters_df[batters_df["Batting_FP"]>40]["Batting_FP"].mean())
print(batters_df[batters_df["Batting_FP"]>50]["Batting_FP"].mean())

42.43231641882569
53.14416719982939
71.00408618127786
79.09402546523016

# Create binary target column based on threshold
batters_df['target'] = (batters_df['Batting_FP'] >= 40).astype(int)

# select top features with highest correlations
batting_features = ['batting_position', 'Batting_FP_rolling5', 'runs_rolling5', 'balls_rolling5',
                       'fours_rolling5', 'sixes_rolling5', 'strike_rate_rolling5',
                       'career_Batting_FP_mean', 'career_strike_rate_mean', 'career_balls_mean',
                       'career_runs_mean', 'career_fours_mean', 'career_sixes_mean']

# Prepare train-test split for classification with stratification on target
X = batters_df[batting_features]
Y = batters_df['target']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, random_state=42, test_size=0.2)

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix

# Compute class imbalance ratio
ratio = Y_train.value_counts()[0] / Y_train.value_counts()[1]

# Define a list of classification models
models = {
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100, max_depth=6, class_weight='balanced', random_state=42),
    "XGBoost": XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, scale_pos_weight=ratio, use_label_encoder=False, eval_metric='logloss', random_state=42),
    "Logistic Regression": LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42),
    "CatBoost": CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, class_weights=[1, ratio], verbose=0, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, class_weight='balanced', random_state=42, verbosity=-1),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, max_depth=6, class_weight='balanced', random_state=42),
    "Balanced Bagging": BalancedBaggingClassifier(estimator=DecisionTreeClassifier(max_depth=5), n_estimators=50, sampling_strategy='auto', replacement=False, random_state=42)
}

bat_percent_6_correct = {}
for name, model in models.items():
  fantasy_model_pipeline(batters_df, name, model, batting_features, X_train, Y_train, X_test, Y_test, bat_percent_6_correct)
  print("\n\n------------------------------------------------\n\n")

Model: RandomForestClassifier
Accuracy: 0.587273530711445
Precision: 0.34058577405857743
Recall: 0.7359855334538878
F1 Score: 0.465675057208238
ROC AUC: 0.7015941753116969

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 99.62% of matches.
The model predicted at least 3 players correctly in 98.10% of matches.
The model predicted at least 4 players correctly in 95.45% of matches.
The model predicted at least 5 players correctly in 92.54% of matches.
The model predicted at least 6 players correctly in 87.86% of matches.
The model predicted at least 7 players correctly in 80.78% of matches.
The model predicted at least 8 players correctly in 69.03% of matches.
The model predicted at least 9 players correctly in 55.75% of matches.
The model predicted at least 10 players correctly in 42.35% of matches.
The model predicted at least 11 players correctly in 31.35% of matches.


------------------------------------------------


Model: XGBoost
Accuracy: 0.6288113124171454
Precision: 0.35490394337714865
Recall: 0.6347197106690777
F1 Score: 0.45525291828793774
ROC AUC: 0.6877816905132029

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 99.87% of matches.
The model predicted at least 3 players correctly in 99.24% of matches.
The model predicted at least 4 players correctly in 97.85% of matches.
The model predicted at least 5 players correctly in 95.95% of matches.
The model predicted at least 6 players correctly in 93.68% of matches.
The model predicted at least 7 players correctly in 90.27% of matches.
The model predicted at least 8 players correctly in 83.69% of matches.
The model predicted at least 9 players correctly in 76.99% of matches.
The model predicted at least 10 players correctly in 68.27% of matches.
The model predicted at least 11 players correctly in 57.27% of matches.


------------------------------------------------


Model: Logistic Regression
Accuracy: 0.618647812638091
Precision: 0.35648148148148145
Recall: 0.6962025316455697
F1 Score: 0.47152480097979177
ROC AUC: 0.7088644607298837

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 99.49% of matches.
The model predicted at least 3 players correctly in 97.72% of matches.
The model predicted at least 4 players correctly in 95.07% of matches.
The model predicted at least 5 players correctly in 92.29% of matches.
The model predicted at least 6 players correctly in 88.12% of matches.
The model predicted at least 7 players correctly in 79.52% of matches.
The model predicted at least 8 players correctly in 69.66% of matches.
The model predicted at least 9 players correctly in 57.65% of matches.
The model predicted at least 10 players correctly in 42.98% of matches.
The model predicted at least 11 players correctly in 31.86% of matches.


------------------------------------------------


Model: CatBoost
Accuracy: 0.6389748121961998
Precision: 0.36585365853658536
Recall: 0.650994575045208
F1 Score: 0.468445022771633
ROC AUC: 0.6907601281685226

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 99.87% of matches.
The model predicted at least 3 players correctly in 99.24% of matches.
The model predicted at least 4 players correctly in 97.72% of matches.
The model predicted at least 5 players correctly in 95.58% of matches.
The model predicted at least 6 players correctly in 93.30% of matches.
The model predicted at least 7 players correctly in 90.39% of matches.
The model predicted at least 8 players correctly in 83.94% of matches.
The model predicted at least 9 players correctly in 75.98% of matches.
The model predicted at least 10 players correctly in 67.51% of matches.
The model predicted at least 11 players correctly in 56.76% of matches.


------------------------------------------------


Model: LightGBM
Accuracy: 0.6261599646486964
Precision: 0.3530591775325978
Recall: 0.6365280289330922
F1 Score: 0.4541935483870968
ROC AUC: 0.6834248067425949

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 99.87% of matches.
The model predicted at least 3 players correctly in 99.12% of matches.
The model predicted at least 4 players correctly in 97.22% of matches.
The model predicted at least 5 players correctly in 95.83% of matches.
The model predicted at least 6 players correctly in 93.81% of matches.
The model predicted at least 7 players correctly in 89.51% of matches.
The model predicted at least 8 players correctly in 83.94% of matches.
The model predicted at least 9 players correctly in 74.72% of matches.
The model predicted at least 10 players correctly in 64.73% of matches.
The model predicted at least 11 players correctly in 55.88% of matches.

matches = batters_df.groupby("match_id")

# Count how many players per match are marked as actual/predicted success (target=1)
def get_target_pred_stats(matches_groups, feature):
    target_1_counts = []
    target_0_counts = []
    players_per_match = []

    for match_id, rows in matches_groups:
        # count the number of "target == 1" and "target == 0"
        target_1_count = (rows[feature] == 1).sum()
        target_0_count = (rows[feature] == 0).sum()

        # append counts to respective lists
        target_1_counts.append(target_1_count)
        target_0_counts.append(target_0_count)
        players_per_match.append(len(rows))

    # calculate avg, max, min
    avg_target_1 = sum(target_1_counts) / len(target_1_counts)
    max_target_1 = max(target_1_counts)
    min_target_1 = min(target_1_counts)

    avg_target_0 = sum(target_0_counts) / len(target_0_counts)
    max_target_0 = max(target_0_counts)
    min_target_0 = min(target_0_counts)

    avg_players = sum(players_per_match) / len(players_per_match)
    max_players = max(players_per_match)
    min_players = min(players_per_match)

    return {
        "Avg Target = 1": avg_target_1,
        "Max Target = 1": max_target_1,
        "Min Target = 1": min_target_1,
        "Avg Target = 0": avg_target_0,
        "Max Target = 0": max_target_0,
        "Min Target = 0": min_target_0,
        "Avg Players per Match": avg_players,
        "Max Players per Match": max_players,
        "Min Players per Match": min_players,
    }

# Visualize prediction distribution patterns across models
def show_custom_stats(matches_groups, model_names):
  stats = {}
  for model in model_names:
      stats[model] = get_target_pred_stats(matches_groups, model)

  stats_df = pd.DataFrame(stats).T

  display(stats_df)

  stats_df.index.name = 'Model'

  stats_df['Label'] = stats_df.index.map(short_labels)
  stats_df = stats_df.set_index('Label')

  sns.set(style="whitegrid", palette="muted")
  fig, axes = plt.subplots(3, 1, figsize=(10, 12))

  # Plot 1: Avg Target == 1
  sns.barplot(x=stats_df.index, y=stats_df['Avg Target = 1'], ax=axes[0], color='skyblue')
  axes[0].set_title('Average Target = 1 per Model')
  axes[0].set_ylabel('Average Count')
  axes[0].set_xlabel('')
  axes[0].set_xticklabels(axes[0].get_xticklabels(), rotation=30, ha='right')

  # Plot 2: Avg Target == 0
  sns.barplot(x=stats_df.index, y=stats_df['Avg Target = 0'], ax=axes[1], color='salmon')
  axes[1].set_title('Average Target = 0 per Model')
  axes[1].set_ylabel('Average Count')
  axes[1].set_xlabel('')
  axes[1].set_xticklabels(axes[1].get_xticklabels(), rotation=30, ha='right')

  # Plot 3: Avg Players per Match
  sns.barplot(x=stats_df.index, y=stats_df['Avg Players per Match'], ax=axes[2], color='lightgreen')
  axes[2].set_title('Average Players per Match')
  axes[2].set_ylabel('Average Count')
  axes[2].set_xlabel('')
  axes[2].set_xticklabels(axes[2].get_xticklabels(), rotation=30, ha='right')

  plt.tight_layout()
  plt.show()

model_names = [
    "target",  # Actual target
    "predicted_target_by_RandomForestClassifier",
    "predicted_target_by_XGBoost",
    "predicted_target_by_Logistic Regression",
    "predicted_target_by_CatBoost",
    "predicted_target_by_LightGBM",
    "predicted_target_by_Extra Trees",
    "predicted_target_by_Balanced Bagging",
]

# shorten labels for better plotting
short_labels = {
    "target": "Actual",
    "predicted_target_by_RandomForestClassifier": "RF",
    "predicted_target_by_XGBoost": "XGB",
    "predicted_target_by_Logistic Regression": "LR",
    "predicted_target_by_CatBoost": "CatBoost",
    "predicted_target_by_LightGBM": "LightGBM",
    "predicted_target_by_Extra Trees": "ExtraTrees",
    "predicted_target_by_Balanced Bagging": "BalancedBagging",
    'predicted_target_by_Final_XGBoost_Batters': "FinalBatters"
}

show_custom_stats(matches, model_names)

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter grid for tuning
param_grid = {
    'scale_pos_weight': [0.3 * ratio, 0.5 * ratio, ratio],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1],
    'min_child_weight': [5, 10],
    'gamma': [0, 0.1]
}

xgb = XGBClassifier(
    n_estimators=300,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# Set up grid search with cross-validation
grid = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='f1',
    cv=3,
    verbose=1,
    n_jobs=-1
)

# Fit grid search on training data
grid.fit(X_train, Y_train)
print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)

Fitting 3 folds for each of 72 candidates, totalling 216 fits
Best params: {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 5, 'scale_pos_weight': np.float64(3.089019430637144)}
Best score: 0.4904597502905624

best_params = {
    'gamma': 0.1,
    'learning_rate': 0.05,
    'max_depth': 3,
    'min_child_weight': 10,
    'scale_pos_weight': 3.09
}


final_model = XGBClassifier(
    **best_params,
    n_estimators=300,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

final_model.fit(X_train, Y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=0.1, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=10, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=300,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric='logloss',
              feature_types=None, gamma=0.1, grow_policy=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=0.05, max_bin=None, max_cat_threshold=None,
              max_cat_to_onehot=None, max_delta_step=None, max_depth=3,
              max_leaves=None, min_child_weight=10, missing=nan,
              monotone_constraints=None, multi_strategy=None, n_estimators=300,
              n_jobs=None, num_parallel_tree=None, random_state=42, ...)

fantasy_model_pipeline(batters_df, "FinalXGBoost", final_model, batting_features, X_train, Y_train, X_test, Y_test, bat_percent_6_correct)

Model: FinalXGBoost
Accuracy: 0.5788775961113566
Precision: 0.3366013071895425
Recall: 0.7450271247739603
F1 Score: 0.4637028700056275
ROC AUC: 0.6987230735065512

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 99.62% of matches.
The model predicted at least 3 players correctly in 97.98% of matches.
The model predicted at least 4 players correctly in 95.95% of matches.
The model predicted at least 5 players correctly in 92.54% of matches.
The model predicted at least 6 players correctly in 88.12% of matches.
The model predicted at least 7 players correctly in 79.52% of matches.
The model predicted at least 8 players correctly in 69.53% of matches.
The model predicted at least 9 players correctly in 54.49% of matches.
The model predicted at least 10 players correctly in 41.85% of matches.
The model predicted at least 11 players correctly in 30.47% of matches.

# Train model, predict targets, and compute per-match accuracy against a threshold

def evaluate_fantasy_model(model, X, y, df_meta, features, threshold):

    model.fit(X, y)
    df_meta['predicted_target'] = model.predict(X)

    grouped = df_meta.groupby('match_id')

    corrects = {}
    for match_id, group in grouped:
        correct = (group['predicted_target'] == group['target']).sum()
        corrects[match_id] = correct

    correct_series = pd.Series(corrects)
    percentage = match_accuracy_threshold_report(correct_series, threshold)
    return percentage

# Perform grid search over hyperparameters using a custom evaluation metric

from sklearn.model_selection import ParameterGrid

param_grid = {
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'min_child_weight': [5, 10],
    'scale_pos_weight': [0.3 * ratio, 0.5 * ratio, ratio],
    'gamma': [0, 0.1]
}

best_score = -1
best_params = None

for params in ParameterGrid(param_grid):
    model = XGBClassifier(
        n_estimators=300,
        eval_metric='logloss',
        random_state=42,
        **params
    )

    score = evaluate_fantasy_model(model, X, Y, df_meta=batters_df.copy(), features=batting_features, threshold=8)

    if score > best_score:
        best_score = score
        best_params = params

print("\nBest Params:", best_params)
print("Best Custom Metric:", best_score)

The model predicted at least 8 players correctly in 80.78% of matches.
The model predicted at least 8 players correctly in 82.05% of matches.
The model predicted at least 8 players correctly in 68.52% of matches.
The model predicted at least 8 players correctly in 80.91% of matches.
The model predicted at least 8 players correctly in 82.05% of matches.
The model predicted at least 8 players correctly in 69.79% of matches.
The model predicted at least 8 players correctly in 82.68% of matches.
The model predicted at least 8 players correctly in 86.98% of matches.
The model predicted at least 8 players correctly in 80.28% of matches.
The model predicted at least 8 players correctly in 82.68% of matches.
The model predicted at least 8 players correctly in 86.47% of matches.
The model predicted at least 8 players correctly in 77.62% of matches.
The model predicted at least 8 players correctly in 81.92% of matches.
The model predicted at least 8 players correctly in 83.94% of matches.
The model predicted at least 8 players correctly in 74.72% of matches.
The model predicted at least 8 players correctly in 81.04% of matches.
The model predicted at least 8 players correctly in 84.07% of matches.
The model predicted at least 8 players correctly in 74.46% of matches.
The model predicted at least 8 players correctly in 86.60% of matches.
The model predicted at least 8 players correctly in 90.90% of matches.
The model predicted at least 8 players correctly in 87.48% of matches.
The model predicted at least 8 players correctly in 85.34% of matches.
The model predicted at least 8 players correctly in 89.00% of matches.
The model predicted at least 8 players correctly in 86.09% of matches.
The model predicted at least 8 players correctly in 80.78% of matches.
The model predicted at least 8 players correctly in 82.05% of matches.
The model predicted at least 8 players correctly in 69.53% of matches.
The model predicted at least 8 players correctly in 80.91% of matches.
The model predicted at least 8 players correctly in 82.05% of matches.
The model predicted at least 8 players correctly in 69.79% of matches.
The model predicted at least 8 players correctly in 82.81% of matches.
The model predicted at least 8 players correctly in 87.23% of matches.
The model predicted at least 8 players correctly in 79.52% of matches.
The model predicted at least 8 players correctly in 82.81% of matches.
The model predicted at least 8 players correctly in 86.85% of matches.
The model predicted at least 8 players correctly in 77.37% of matches.
The model predicted at least 8 players correctly in 81.92% of matches.
The model predicted at least 8 players correctly in 83.94% of matches.
The model predicted at least 8 players correctly in 74.59% of matches.
The model predicted at least 8 players correctly in 81.04% of matches.
The model predicted at least 8 players correctly in 84.07% of matches.
The model predicted at least 8 players correctly in 74.08% of matches.
The model predicted at least 8 players correctly in 86.73% of matches.
The model predicted at least 8 players correctly in 90.64% of matches.
The model predicted at least 8 players correctly in 86.73% of matches.
The model predicted at least 8 players correctly in 85.46% of matches.
The model predicted at least 8 players correctly in 89.38% of matches.
The model predicted at least 8 players correctly in 85.59% of matches.

Best Params: {'gamma': 0, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 5, 'scale_pos_weight': np.float64(1.544509715318572)}
Best Custom Metric: 90.89759797724399

final_batters_model = XGBClassifier(
    gamma=0,
    learning_rate=0.1,
    max_depth=5,
    min_child_weight=5,
    scale_pos_weight=1.5454134658834162,
    n_estimators=300,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

# using the helper functions defined way above when we started our classification quest
fantasy_model_pipeline(batters_df, "Final_XGBoost_Batters", final_batters_model, batting_features, X_train, Y_train, X_test, Y_test, bat_percent_6_correct)
show_custom_stats(matches, ["target", "predicted_target_by_Final_XGBoost_Batters"])

Model: Final_XGBoost_Batters
Accuracy: 0.7030490499337163
Precision: 0.36
Recall: 0.2766726943942134
F1 Score: 0.3128834355828221
ROC AUC: 0.6799440584583822

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 100.00% of matches.
The model predicted at least 3 players correctly in 99.37% of matches.
The model predicted at least 4 players correctly in 98.48% of matches.
The model predicted at least 5 players correctly in 96.59% of matches.
The model predicted at least 6 players correctly in 94.56% of matches.
The model predicted at least 7 players correctly in 93.05% of matches.
The model predicted at least 8 players correctly in 89.00% of matches.
The model predicted at least 9 players correctly in 85.21% of matches.
The model predicted at least 10 players correctly in 78.00% of matches.
The model predicted at least 11 players correctly in 69.53% of matches.

# add model confidence to dataframe
# this helps us build our team
# batters with most confidence will be selected
batters_df["bat_confidence"] = final_model.predict_proba(X)[:, 1]

N = 6

# For each match, pick N players with the highest prediction confidence
my_batters_score = {}
# Group by match and collect predicted scores for top-N batters
matches = batters_df.groupby("match_id")
for match_id, rows in matches:
  # choose N batters which the model thinks have the best chance of performing
  batters = rows.sort_values(by="bat_confidence", ascending=False)
  bat_score = batters.head(N)["Batting_FP"].sum()
  my_batters_score[match_id] = bat_score
# Ground truth: best actual scoring batters in hindsight
actual_top_n_batters_score = {}

for match_id, rows in matches:
  actual_bat_score = rows.sort_values(by="Batting_FP", ascending=False).head(N)["Batting_FP"].sum()
  actual_top_n_batters_score[match_id] = actual_bat_score

pred_scores = np.array(list(my_batters_score.values()))
actual_scores = np.array(list(actual_top_n_batters_score.values()))

# summary stats for predicted
print("Model-Selected Team Stats:")
print(f"Avg:   {pred_scores.mean():.2f}")
print(f"Min:   {pred_scores.min():.2f}")
print(f"Max:   {pred_scores.max():.2f}")
print(f"Median:{np.median(pred_scores):.2f}")
print(f"Std:   {pred_scores.std():.2f}")
print(f"25th percentile: {np.percentile(pred_scores, 25):.2f}")
print(f"75th percentile: {np.percentile(pred_scores, 75):.2f}")
print()

# summary stats for actual top 5
print(f"Ideal Top-{N} Batter Stats:")
print(f"Avg:   {actual_scores.mean():.2f}")
print(f"Min:   {actual_scores.min():.2f}")
print(f"Max:   {actual_scores.max():.2f}")
print(f"Median:{np.median(actual_scores):.2f}")
print(f"Std:   {actual_scores.std():.2f}")
print(f"25th percentile: {np.percentile(actual_scores, 25):.2f}")
print(f"75th percentile: {np.percentile(actual_scores, 75):.2f}")
print()

# efficiency (higher the better, 1 is good)
efficiency = pred_scores.mean() / actual_scores.mean()
print(f"Average Efficiency: {efficiency:.2%}")

Model-Selected Team Stats:
Avg:   224.90
Min:   33.00
Max:   511.00
Median:220.00
Std:   85.36
25th percentile: 161.50
75th percentile: 282.00

Ideal Top-6 Batter Stats:
Avg:   306.73
Min:   68.00
Max:   585.00
Median:307.00
Std:   88.41
25th percentile: 250.00
75th percentile: 366.00

Average Efficiency: 73.32%

# calculate efficiency per match
efficiencies = [
    my_batters_score[m] / actual_top_n_batters_score[m]
    for m in my_batters_score
    if actual_top_n_batters_score[m] > 0
]

# average score efficiency
avg_efficiency = np.mean(efficiencies)
print(f"Average Score Efficiency: {avg_efficiency:.2%}")

# standard deviation
std_efficiency = np.std(efficiencies)
print(f"Std Dev of Efficiency: {std_efficiency:.2%}")

# number of matches evaluated
print(f"Matches Evaluated: {len(efficiencies)}")

# how often model scores at least 80% of ideal
win_threshold = 0.80
num_wins = sum(e >= win_threshold for e in efficiencies)
win_rate = num_wins / len(efficiencies)
print(f"% Matches where model got ≥ 80% of top 5 score: {win_rate:.2%} ({num_wins} matches)")

# best and worst matches
best_match = max(efficiencies)
worst_match = min(efficiencies)
print(f"Best Match Efficiency: {best_match:.2%}")
print(f"Worst Match Efficiency: {worst_match:.2%}")


plt.figure(figsize=(10, 5))
plt.hist(efficiencies, bins=20, color='skyblue', edgecolor='black')
plt.axvline(avg_efficiency, color='red', linestyle='--', label=f'Avg = {avg_efficiency:.2%}')
plt.title("Distribution of Match Score Efficiency")
plt.xlabel("My Score / Ideal Top 5 Score")
plt.ylabel("Number of Matches")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

Average Score Efficiency: 73.38%
Std Dev of Efficiency: 17.31%
Matches Evaluated: 791
% Matches where model got ≥ 80% of top 5 score: 39.19% (310 matches)
Best Match Efficiency: 100.00%
Worst Match Efficiency: 19.68%

points = {}

for match_id, match_df in batters_df.groupby("match_id"):
  points[match_id] = match_df['Batting_FP'].sum()

points_values = list(points.values())
print("Average Match Points:", np.mean(points_values))
print("Max Points:", np.max(points_values))
print("Min Points:", np.min(points_values))
print("Standard Deviation:", np.std(points_values))

Average Match Points: 368.8356510745891
Max Points: 627
Min Points: 67
Standard Deviation: 105.79506125950432

# get top 6 batters of each match and get average of sums of these top 6 batting scores
top6_points = {}

for match_id, match_df in batters_df.groupby("match_id"):
  top6_points[match_id] = match_df['Batting_FP'].sort_values(ascending=False).head(7).sum()

points_values = list(top6_points.values())
print("Average Match Points:", np.mean(points_values))
print("Max Points:", np.max(points_values))
print("Min Points:", np.min(points_values))
print("Standard Deviation:", np.std(points_values))

Average Match Points: 324.9039190897598
Max Points: 600
Min Points: 70
Standard Deviation: 93.6346336797305

# follow similar method to find bolwer threshold

print(bowlers_df[bowlers_df["Bowling_FP"]>10]["Bowling_FP"].mean())
print(bowlers_df[bowlers_df["Bowling_FP"]>20]["Bowling_FP"].mean())
print(bowlers_df[bowlers_df["Bowling_FP"]>30]["Bowling_FP"].mean())
print(bowlers_df[bowlers_df["Bowling_FP"]>40]["Bowling_FP"].mean())

45.158686730506155
46.12264723740134
58.910755148741416
68.51900452488688

bowlers_df['target'] = (bowlers_df['Bowling_FP'] >= 30).astype(int)

# the lower threshold also aligns with the fact that bowlers tend to rack less points on average than batters

bowling_features = ['Bowling_FP_rolling5','overs_rolling5','total_balls_rolling5','dots_rolling5','maidens_rolling5','conceded_rolling5','foursConceded_rolling5',
                      'sixesConceded_rolling5','wickets_rolling5','economyRate_rolling5','wides_rolling5','noballs_rolling5','career_Bowling_FP_mean','career_overs_mean','career_total_balls_mean',
                      'career_dots_mean','career_conceded_mean','career_foursConceded_mean','career_sixesConceded_mean','career_wickets_mean','career_economyRate_mean','career_wides_mean',
                      'career_noballs_mean','career_maidens_sum','is_home']

X = bowlers_df[bowling_features]
Y = bowlers_df['target']

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify=Y, random_state=42, test_size=0.2)

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# compute class imbalance ratio
ratio = Y_train.value_counts()[0] / Y_train.value_counts()[1]

# models for bowling performance classification
bowler_models = {
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100, max_depth=6, class_weight='balanced', random_state=42),
    "XGBoost": XGBClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, scale_pos_weight=ratio, use_label_encoder=False, eval_metric='logloss', random_state=42),
    "Logistic Regression": LogisticRegression(class_weight='balanced', solver='liblinear', random_state=42),
    "CatBoost": CatBoostClassifier(iterations=300, learning_rate=0.1, depth=6, class_weights=[1, ratio], verbose=0, random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=200, max_depth=5, learning_rate=0.1, class_weight='balanced', random_state=42, verbosity=-1),
    "Extra Trees": ExtraTreesClassifier(n_estimators=100, max_depth=6, class_weight='balanced', random_state=42),
    "Balanced Bagging": BalancedBaggingClassifier(estimator=DecisionTreeClassifier(max_depth=5), n_estimators=50, sampling_strategy='auto', replacement=False, random_state=42)
}

bowl_percent_6_correct = {}
for name, model in models.items():
  fantasy_model_pipeline(bowlers_df, name, model, bowling_features, X_train, Y_train, X_test, Y_test, bowl_percent_6_correct)
  print("\n\n------------------------------------------------\n\n")

Model: RandomForestClassifier
Accuracy: 0.5112866817155757
Precision: 0.38537906137184114
Recall: 0.6977124183006536
F1 Score: 0.4965116279069767
ROC AUC: 0.5871675681766959

The model predicted at least 1 players correctly in 99.87% of matches.
The model predicted at least 2 players correctly in 99.24% of matches.
The model predicted at least 3 players correctly in 97.35% of matches.
The model predicted at least 4 players correctly in 91.28% of matches.
The model predicted at least 5 players correctly in 80.28% of matches.
The model predicted at least 6 players correctly in 64.73% of matches.
The model predicted at least 7 players correctly in 46.14% of matches.
The model predicted at least 8 players correctly in 29.33% of matches.
The model predicted at least 9 players correctly in 13.78% of matches.
The model predicted at least 10 players correctly in 5.44% of matches.
The model predicted at least 11 players correctly in 1.64% of matches.


------------------------------------------------


Model: XGBoost
Accuracy: 0.47686230248307
Precision: 0.36618521665250636
Recall: 0.704248366013072
F1 Score: 0.48183342649524874
ROC AUC: 0.5470848828036962

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 99.87% of matches.
The model predicted at least 3 players correctly in 98.99% of matches.
The model predicted at least 4 players correctly in 95.58% of matches.
The model predicted at least 5 players correctly in 90.14% of matches.
The model predicted at least 6 players correctly in 83.82% of matches.
The model predicted at least 7 players correctly in 72.69% of matches.
The model predicted at least 8 players correctly in 56.26% of matches.
The model predicted at least 9 players correctly in 34.89% of matches.
The model predicted at least 10 players correctly in 18.84% of matches.
The model predicted at least 11 players correctly in 8.22% of matches.


------------------------------------------------


Model: Logistic Regression
Accuracy: 0.5417607223476298
Precision: 0.40118577075098816
Recall: 0.6633986928104575
F1 Score: 0.5
ROC AUC: 0.5910919540229885

The model predicted at least 1 players correctly in 99.87% of matches.
The model predicted at least 2 players correctly in 98.74% of matches.
The model predicted at least 3 players correctly in 96.21% of matches.
The model predicted at least 4 players correctly in 89.63% of matches.
The model predicted at least 5 players correctly in 75.22% of matches.
The model predicted at least 6 players correctly in 59.92% of matches.
The model predicted at least 7 players correctly in 43.24% of matches.
The model predicted at least 8 players correctly in 26.68% of matches.
The model predicted at least 9 players correctly in 14.03% of matches.
The model predicted at least 10 players correctly in 5.69% of matches.
The model predicted at least 11 players correctly in 1.64% of matches.


------------------------------------------------


Model: CatBoost
Accuracy: 0.4898419864559819
Precision: 0.3797364085667216
Recall: 0.7532679738562091
F1 Score: 0.5049288061336255
ROC AUC: 0.5738914243858464

The model predicted at least 1 players correctly in 99.87% of matches.
The model predicted at least 2 players correctly in 99.75% of matches.
The model predicted at least 3 players correctly in 99.24% of matches.
The model predicted at least 4 players correctly in 95.32% of matches.
The model predicted at least 5 players correctly in 88.75% of matches.
The model predicted at least 6 players correctly in 80.78% of matches.
The model predicted at least 7 players correctly in 68.90% of matches.
The model predicted at least 8 players correctly in 47.66% of matches.
The model predicted at least 9 players correctly in 28.95% of matches.
The model predicted at least 10 players correctly in 14.79% of matches.
The model predicted at least 11 players correctly in 5.56% of matches.


------------------------------------------------


Model: LightGBM
Accuracy: 0.5643340857787811
Precision: 0.39473684210526316
Recall: 0.49019607843137253
F1 Score: 0.43731778425655976
ROC AUC: 0.5704037074599955

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 100.00% of matches.
The model predicted at least 3 players correctly in 99.75% of matches.
The model predicted at least 4 players correctly in 99.12% of matches.
The model predicted at least 5 players correctly in 94.69% of matches.
The model predicted at least 6 players correctly in 90.64% of matches.
The model predicted at least 7 players correctly in 86.22% of matches.
The model predicted at least 8 players correctly in 78.63% of matches.
The model predicted at least 9 players correctly in 64.85% of matches.
The model predicted at least 10 players correctly in 44.12% of matches.
The model predicted at least 11 players correctly in 23.51% of matches.

percent_6_correct_list = sorted(bowl_percent_6_correct.items(), key=lambda item: item[1], reverse=True)
print(percent_6_correct_list)

[('LightGBM', np.float64(86.21997471554994)), ('XGBoost', np.float64(72.69279393173198)), ('CatBoost', np.float64(68.90012642225032)), ('RandomForestClassifier', np.float64(46.144121365360306)), ('Logistic Regression', np.float64(43.23640960809102)), ('Balanced Bagging', np.float64(42.22503160556258)), ('Extra Trees', np.float64(39.19089759797724))]

matches = bowlers_df.groupby("match_id")

model_names = [
    "target",  # Actual target
    "predicted_target_by_RandomForestClassifier",
    "predicted_target_by_XGBoost",
    "predicted_target_by_Logistic Regression",
    "predicted_target_by_CatBoost",
    "predicted_target_by_LightGBM",
    "predicted_target_by_Extra Trees",
    "predicted_target_by_Balanced Bagging",
]

# shorten labels for better plotting
short_labels = {
    "target": "Actual",
    "predicted_target_by_RandomForestClassifier": "RF",
    "predicted_target_by_XGBoost": "XGB",
    "predicted_target_by_Logistic Regression": "LR",
    "predicted_target_by_CatBoost": "CatBoost",
    "predicted_target_by_LightGBM": "LightGBM",
    "predicted_target_by_Extra Trees": "ExtraTrees",
    "predicted_target_by_Balanced Bagging": "BalancedBagging",
    "predicted_target_by_Final_lightGBM_bowlers": "FinalBowlers"

}

show_custom_stats(matches, model_names)

from lightgbm import LGBMClassifier
from itertools import product

param_grid = {
    'num_leaves': [15, 31],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'n_estimators': [200],
    'min_child_samples': [10, 20],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'class_weight': ['balanced']
}

from copy import deepcopy
import numpy as np

best_score = -1
best_params = None

# Create all combinations
param_combinations = list(product(*param_grid.values()))
param_names = list(param_grid.keys())

for combo in param_combinations:
    params = dict(zip(param_names, combo))

    model = LGBMClassifier(random_state=42, verbose=-1, **params)

    score = evaluate_fantasy_model(model, X, Y, bowlers_df.copy(), bowling_features, threshold=6)

    print(f"Params: {params} → Score: {score:.2f}%")

    if score > best_score:
        best_score = score
        best_params = deepcopy(params)

print("\nBest Parameters:", best_params)
print(f"Best Fantasy Score (% matches with ≥6 correct): {best_score:.2f}%")

The model predicted at least 6 players correctly in 71.30% of matches.
Params: {'num_leaves': 15, 'max_depth': 3, 'learning_rate': 0.05, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 71.30%
The model predicted at least 6 players correctly in 70.54% of matches.
Params: {'num_leaves': 15, 'max_depth': 3, 'learning_rate': 0.05, 'n_estimators': 200, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 70.54%
The model predicted at least 6 players correctly in 80.15% of matches.
Params: {'num_leaves': 15, 'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 80.15%
The model predicted at least 6 players correctly in 79.01% of matches.
Params: {'num_leaves': 15, 'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 79.01%
The model predicted at least 6 players correctly in 83.82% of matches.
Params: {'num_leaves': 15, 'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 83.82%
The model predicted at least 6 players correctly in 84.07% of matches.
Params: {'num_leaves': 15, 'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 200, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 84.07%
The model predicted at least 6 players correctly in 90.14% of matches.
Params: {'num_leaves': 15, 'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 90.14%
The model predicted at least 6 players correctly in 90.77% of matches.
Params: {'num_leaves': 15, 'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 90.77%
The model predicted at least 6 players correctly in 71.30% of matches.
Params: {'num_leaves': 31, 'max_depth': 3, 'learning_rate': 0.05, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 71.30%
The model predicted at least 6 players correctly in 70.54% of matches.
Params: {'num_leaves': 31, 'max_depth': 3, 'learning_rate': 0.05, 'n_estimators': 200, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 70.54%
The model predicted at least 6 players correctly in 80.15% of matches.
Params: {'num_leaves': 31, 'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 80.15%
The model predicted at least 6 players correctly in 79.01% of matches.
Params: {'num_leaves': 31, 'max_depth': 3, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 79.01%
The model predicted at least 6 players correctly in 88.24% of matches.
Params: {'num_leaves': 31, 'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 88.24%
The model predicted at least 6 players correctly in 88.24% of matches.
Params: {'num_leaves': 31, 'max_depth': 5, 'learning_rate': 0.05, 'n_estimators': 200, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 88.24%
The model predicted at least 6 players correctly in 92.04% of matches.
Params: {'num_leaves': 31, 'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 92.04%
The model predicted at least 6 players correctly in 90.64% of matches.
Params: {'num_leaves': 31, 'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 20, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'} → Score: 90.64%

Best Parameters: {'num_leaves': 31, 'max_depth': 5, 'learning_rate': 0.1, 'n_estimators': 200, 'min_child_samples': 10, 'subsample': 0.8, 'colsample_bytree': 0.8, 'class_weight': 'balanced'}
Best Fantasy Score (% matches with ≥6 correct): 92.04%

final_bowler_model = LGBMClassifier(
    num_leaves=31,
    max_depth=5,
    learning_rate=0.1,
    n_estimators=200,
    min_child_samples=10,
    subsample=0.8,
    colsample_bytree=0.8,
    class_weight='balanced',
    random_state=42,
    verbose=-1  # suppress training messages
)

fantasy_model_pipeline(bowlers_df, "Final_lightGBM_bowlers", final_bowler_model, bowling_features, X_train, Y_train, X_test, Y_test, bowl_percent_6_correct)
show_custom_stats(matches, ["target", "predicted_target_by_Final_lightGBM_bowlers"])

Model: Final_lightGBM_bowlers
Accuracy: 0.551354401805869
Precision: 0.3748290013679891
Recall: 0.4477124183006536
F1 Score: 0.40804169769173493
ROC AUC: 0.5601349447825107

The model predicted at least 1 players correctly in 100.00% of matches.
The model predicted at least 2 players correctly in 100.00% of matches.
The model predicted at least 3 players correctly in 99.75% of matches.
The model predicted at least 4 players correctly in 99.24% of matches.
The model predicted at least 5 players correctly in 95.70% of matches.
The model predicted at least 6 players correctly in 91.15% of matches.
The model predicted at least 7 players correctly in 86.73% of matches.
The model predicted at least 8 players correctly in 80.53% of matches.
The model predicted at least 9 players correctly in 68.77% of matches.
The model predicted at least 10 players correctly in 47.28% of matches.
The model predicted at least 11 players correctly in 26.55% of matches.

# as before, add model confidence in bowlers for team selection
bowlers_df["bowl_confidence"] = final_bowler_model.predict_proba(X)[:, 1]

N = 5

# For each match, sort bowlers by model confidence and select top 5 (or N) as predicted top bowlers
# Then compute actual top 5 based on true Bowling Fantasy Points (Bowling_FP)

my_bowlers_score = {}

matches = bowlers_df.groupby("match_id")
for match_id, rows in matches:
  # choose N bowlers which the model thinks have the best chance of performing
  bowlers = rows.sort_values(by="bowl_confidence", ascending=False)
  bowl_score = bowlers.head(N)["Bowling_FP"].sum()
  my_bowlers_score[match_id] = bowl_score

actual_top_n_bowlers_score = {}

for match_id, rows in matches:
  actual_bowler_score = rows.sort_values(by="Bowling_FP", ascending=False).head(N)["Bowling_FP"].sum()
  actual_top_n_bowlers_score[match_id] = actual_bowler_score

pred_scores = np.array(list(my_bowlers_score.values()))
actual_scores = np.array(list(actual_top_n_bowlers_score.values()))

print("Model-Selected Team Stats:")
print(f"Avg:   {pred_scores.mean():.2f}")
print(f"Min:   {pred_scores.min():.2f}")
print(f"Max:   {pred_scores.max():.2f}")
print(f"Median:{np.median(pred_scores):.2f}")
print(f"Std:   {pred_scores.std():.2f}")
print(f"25th percentile: {np.percentile(pred_scores, 25):.2f}")
print(f"75th percentile: {np.percentile(pred_scores, 75):.2f}")
print()

print(f"Ideal Top-{N} Bowler Stats:")
print(f"Avg:   {actual_scores.mean():.2f}")
print(f"Min:   {actual_scores.min():.2f}")
print(f"Max:   {actual_scores.max():.2f}")
print(f"Median:{np.median(actual_scores):.2f}")
print(f"Std:   {actual_scores.std():.2f}")
print(f"25th percentile: {np.percentile(actual_scores, 25):.2f}")
print(f"75th percentile: {np.percentile(actual_scores, 75):.2f}")
print()

efficiency = pred_scores.mean() / actual_scores.mean()
print(f"Average Efficiency: {efficiency:.2%}")

Model-Selected Team Stats:
Avg:   204.72
Min:   -16.00
Max:   440.00
Median:203.00
Std:   76.65
25th percentile: 153.00
75th percentile: 252.50

Ideal Top-5 Bowler Stats:
Avg:   248.97
Min:   29.00
Max:   440.00
Median:249.00
Std:   74.18
25th percentile: 199.00
75th percentile: 297.00

Average Efficiency: 82.23%

# calculate normalized efficiency per match
efficiencies = [
    my_bowlers_score[m] / actual_top_n_bowlers_score[m]
    for m in my_bowlers_score
    if actual_top_n_bowlers_score[m] > 0
]

# average score efficiency
avg_efficiency = np.mean(efficiencies)
print(f"Average Score Efficiency: {avg_efficiency:.2%}")

# standard deviation
std_efficiency = np.std(efficiencies)
print(f"Std Dev of Efficiency: {std_efficiency:.2%}")

# number of matches evaluated
print(f"Matches Evaluated: {len(efficiencies)}")

# how often model scores at least 80% of ideal
win_threshold = 0.80
num_wins = sum(e >= win_threshold for e in efficiencies)
win_rate = num_wins / len(efficiencies)
print(f"% Matches where model got ≥ 80% of top 5 score: {win_rate:.2%} ({num_wins} matches)")

# best and worst matches
best_match = max(efficiencies)
worst_match = min(efficiencies)
print(f"Best Match Efficiency: {best_match:.2%}")
print(f"Worst Match Efficiency: {worst_match:.2%}")


plt.figure(figsize=(10, 5))
plt.hist(efficiencies, bins=20, color='skyblue', edgecolor='black')
plt.axvline(avg_efficiency, color='red', linestyle='--', label=f'Avg = {avg_efficiency:.2%}')
# plt.axvline(win_threshold, color='green', linestyle='--', label=f'80% Threshold')
plt.title("Distribution of Match Score Efficiency")
plt.xlabel("My Score / Ideal Top 5 Score")
plt.ylabel("Number of Matches")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

Average Score Efficiency: 81.70%
Std Dev of Efficiency: 16.10%
Matches Evaluated: 791
% Matches where model got ≥ 80% of top 5 score: 61.57% (487 matches)
Best Match Efficiency: 100.00%
Worst Match Efficiency: -13.45%

bowlers = list(bowlers_df["fullName"].unique())
batters = list(batters_df["fullName"].unique())
wicketkeepers = {
    'MS Dhoni', 'Rishabh Pant', 'Sanju Samson', 'KL Rahul', 'Ishan Kishan',
    'Quinton de Kock', 'Dinesh Karthik', 'Wriddhiman Saha', 'Jonny Bairstow',
    'Jos Buttler', 'Heinrich Klaasen', 'Matthew Wade', 'Nicholas Pooran',
    'Philip Salt', 'Devon Conway', 'Sam Billings', 'Tom Latham',
    'Ben Duckett', 'Rahmanullah Gurbaz', 'Kusal Perera', 'Mohammad Rizwan',
    'Alex Carey', 'Mohammad Haris', 'Josh Inglis', 'Tom Banton',
    'Nurul Hasan', 'Niroshan Dickwella', 'Litton Das', 'Tim Seifert',
    'Sebastian Klaassen', 'Anuj Rawat', 'Prabhsimran Singh', 'KS Bharat',
    'Sheldon Jackson', 'Vishnu Vinod', 'Arun Jadhav', 'Heet Shah'
}

def in_bowlers(name):
  return name in bowlers

def in_batters(name):
  return name in batters

def in_keepers(name):
  return name in wicketkeepers

batters = {}
player_groups = batters_df.groupby("fullName")
for player_name, rows in player_groups:
  last_match = rows.loc[rows['match_id'].idxmax()]
  batters[player_name] = last_match["career_Batting_FP_mean"]

bowlers = {}
player_groups = bowlers_df.groupby("fullName")
for player_name, rows in player_groups:
  last_match = rows.loc[rows['match_id'].idxmax()]
  bowlers[player_name] = last_match["career_Bowling_FP_mean"]

def add_batter_role(batter_row):
  name = batter_row['fullName']
  bat_fp = batters[name]

  if in_keepers(name):
      return 'wk'

  elif in_batters(name) and not in_bowlers(name):
      return 'batter'

  elif in_bowlers(name) and not in_batters(name):
      print("rare case 1 occured")
      return 'bowler'

  bowl_fp = bowlers[name]

  if bat_fp >= 25 and bowl_fp >= 20:
      return 'all-rounder'

  elif bat_fp >= 20:
      return 'batter'

  elif bowl_fp >= 15:
      return 'bowler'

  else:
      return 'uncategorized'

batters_df["role"] = batters_df.apply(add_batter_role, axis=1)

batters_df["role"].hist()

<Axes: >

batters_df[batters_df["role"] == "uncategorized"]["fullName"].unique()

player_roles = {
    'Abhimanyu Mithun': 'bowler',
    'Abhishek Jhunjhunwala': 'bowler',
    'Abhishek Nayar': 'all-rounder',
    'Ankeet Chavan': 'bowler',
    'Ankit Soni': 'bowler',
    'Asad Pathan': 'all-rounder',
    'Balachandra Akhil': 'all-rounder',
    'Chamara Kapugedera': 'batter',
    'Dan Christian': 'all-rounder',
    'Daryl Mitchell': 'all-rounder',
    'Deepak Hooda': 'all-rounder',
    'Dinesh Salunkhe': 'bowler',
    'Hanuma Vihari': 'batter',
    'Hrithik Shokeen': 'all-rounder',
    'James Neesham': 'all-rounder',
    'Jayant Yadav': 'bowler',
    'Jeevan Mendis': 'all-rounder',
    'Jhye Richardson': 'bowler',
    'KP Appanna': 'bowler',
    'Karan Goel': 'batter',
    'Kartik Tyagi': 'bowler',
    'Kyle Abbott': 'bowler',
    'Marlon Samuels': 'all-rounder',
    'Mohammad Hafeez': 'all-rounder',
    'Pankaj Singh': 'bowler',
    'Parvez Rasool': 'all-rounder',
    'Rahul Tewatia': 'all-rounder',
    'Rajagopal Sathish': 'all-rounder',
    'Ramesh Powar': 'bowler',
    'Rasikh Salam': 'bowler',
    'Sachin Rana': 'all-rounder',
    'Scott Styris': 'all-rounder',
    'Sean Abbott': 'bowler',
    'Shahbaz Ahmed': 'all-rounder',
    'Shashank Singh': 'batter',
    'Sherfane Rutherford': 'all-rounder',
    'Shoaib Malik': 'all-rounder',
    'Siddharth Chitnis': 'batter',
    'Stuart Binny': 'all-rounder',
    'Sunil Joshi': 'bowler',
    'Swapnil Singh': 'all-rounder',
    'Vikramjeet Malik': 'bowler',
    'Yogesh Nagar': 'batter'
}

def fill_missing_roles(row):
  if row["role"] == "uncategorized":
    return player_roles[row["fullName"]]
  else:
    return row["role"]

batters_df["role"] = batters_df.apply(fill_missing_roles, axis=1)

batters_df["role"].hist()

<Axes: >

def add_bowler_role(bowler_row):
  name = bowler_row['fullName']
  bowl_fp = bowlers[name]

  if in_keepers(name):
      return 'wk'

  elif in_batters(name) and not in_bowlers(name):
      print("rare case 1 occured")
      return 'batter'

  elif in_bowlers(name) and not in_batters(name):
      return 'bowler'

  bat_fp = batters[name]

  if bat_fp >= 25 and bowl_fp >= 20:
      return 'all-rounder'

  elif bat_fp >= 20:
      return 'batter'

  elif bowl_fp >= 15:
      return 'bowler'

  else:
      return 'uncategorized'

bowlers_df["role"] = bowlers_df.apply(add_bowler_role, axis=1)

bowlers_df["role"].hist()

<Axes: >

bowlers_df[bowlers_df["role"] == "uncategorized"]["fullName"].unique()

player_roles = {
    "Abhimanyu Mithun": "bowler",
    "Abhishek Jhunjhunwala": "all‑rounder",
    "Abhishek Nayar": "all‑rounder",
    "Ankeet Chavan": "all‑rounder",
    "Ankit Soni": "bowler",
    "Asad Pathan": "all‑rounder",
    "Balachandra Akhil": "all‑rounder",
    "Chamara Kapugedera": "batter",
    "Dan Christian": "all‑rounder",
    "Daryl Mitchell": "all‑rounder",
    "Deepak Hooda": "all‑rounder",
    "Dinesh Salunkhe": "batter",
    "Hanuma Vihari": "batter",
    "Hrithik Shokeen": "all‑rounder",
    "James Neesham": "all‑rounder",
    "Jayant Yadav": "all‑rounder",
    "Jeevan Mendis": "all‑rounder",
    "Jhye Richardson": "bowler",
    "KP Appanna": "bowler",
    "Karan Goel": "batter",
    "Kartik Tyagi": "bowler",
    "Kyle Abbott": "bowler",
    "Marlon Samuels": "batter",
    "Mohammad Hafeez": "all‑rounder",
    "Pankaj Singh": "bowler",
    "Parvez Rasool": "all‑rounder",
    "Rahul Tewatia": "all‑rounder",
    "Rajagopal Sathish": "all‑rounder",
    "Ramesh Powar": "bowler",
    "Rasikh Salam": "bowler",
    "Sachin Rana": "all‑rounder",
    "Scott Styris": "all‑rounder",
    "Sean Abbott": "bowler",
    "Shahbaz Ahmed": "all‑rounder",
    "Shashank Singh": "batter",
    "Sherfane Rutherford": "all‑rounder",
    "Shoaib Malik": "all-rounder",
    "Siddharth Chitnis": "batter",
    "Stuart Binny": "all‑rounder",
    "Sunil Joshi": "all‑rounder",
    "Swapnil Singh": "all‑rounder",
    "Vikramjeet Malik": "bowler",
    "Yogesh Nagar": "wicket‑keeper"
}

bowlers_df["role"] = bowlers_df.apply(fill_missing_roles, axis=1)

bowlers_df["role"].hist()

<Axes: >

print(bowlers_df["role"].unique())
bowlers_df["role"] = bowlers_df["role"].str.replace('‑', '-', regex=False)
print(bowlers_df["role"].unique())

['batter' 'bowler' 'all‑rounder' 'all-rounder' 'wicket‑keeper']
['batter' 'bowler' 'all-rounder' 'wicket-keeper']

def pick_team_role_based(batter_rows, bowler_rows):

  # top 3 batters
  # top 3 bowlers
  # top 1 wk
  # any combination of batters, bowlers, all-rounders, wk
  combined = pd.merge(batter_rows, bowler_rows, on=['match_id', 'season', 'match_name', 'home_team', 'away_team', 'venue', 'fullName', 'role'], how='outer')
  combined.drop_duplicates(subset=["fullName"], keep='first', inplace=True, ignore_index=False)
  combined["Batting_FP"] = combined["Batting_FP"].fillna(0)
  combined["Bowling_FP"] = combined["Bowling_FP"].fillna(0)
  combined["bat_confidence"] = combined["bat_confidence"].fillna(0)
  combined["bowl_confidence"] = combined["bowl_confidence"].fillna(0)

  combined["Total_FP"] = combined["Batting_FP"] + combined["Bowling_FP"]
  combined["total_confidence"] = combined["bat_confidence"] + combined["bowl_confidence"]

  batters = combined[combined["role"] == 'batter']
  bowlers = combined[combined["role"] == 'bowler']
  wicketkeepers = combined[combined["role"] == 'wk']
  all_rounders = combined[combined["role"] == 'all-rounder']

  pred_batters = batters.sort_values(by="bat_confidence", ascending=False)
  ideal_batters = batters.sort_values(by="Batting_FP", ascending=False)

  pred_bowlers = bowlers.sort_values(by="bowl_confidence", ascending=False)
  ideal_bowlers = bowlers.sort_values(by="Bowling_FP", ascending=False)

  # all keepers are batsmen (or atleast not bowlers)
  pred_wks = wicketkeepers.sort_values(by="bat_confidence", ascending=False)
  ideal_wks = wicketkeepers.sort_values(by="bat_confidence", ascending=False)

  pred_all_rounders = all_rounders.sort_values(by="total_confidence", ascending=False)
  ideal_all_rounders = all_rounders.sort_values(by="Total_FP", ascending=False)

  pred_team = []
  ideal_team = []

  # pick three batters
  pred_team.extend(pred_batters.head(3).to_dict(orient='records'))
  ideal_team.extend(ideal_batters.head(3).to_dict(orient='records'))

  # pick three bowlers
  pred_team.extend(pred_bowlers.head(3).to_dict(orient='records'))
  ideal_team.extend(ideal_bowlers.head(3).to_dict(orient='records'))

  # pick 1 wk
  pred_team.extend(pred_wks.head(1).to_dict(orient='records'))
  ideal_team.extend(ideal_wks.head(1).to_dict(orient='records'))

  # rest four from any category
  pred_combined = combined.sort_values(by="total_confidence")
  ideal_combined = combined.sort_values(by="Total_FP")

  # Create a set of already picked fullNames
  picked_names = {p['fullName'] for p in pred_team}

  # Filter remaining candidates
  remaining_pred = pred_combined[~pred_combined['fullName'].isin(picked_names)].head(4)
  pred_team.extend(remaining_pred.to_dict(orient='records'))

  # Repeat for ideal_team
  picked_ideal_names = {p['fullName'] for p in ideal_team}
  remaining_ideal = ideal_combined[~ideal_combined['fullName'].isin(picked_ideal_names)].head(4)
  ideal_team.extend(remaining_ideal.to_dict(orient='records'))

  return pred_team, ideal_team

my_teams = {}
ideal_teams = {}

for match_id, bat_rows in batting_matches:
  bowl_rows = bowling_matches.get_group(match_id)
  my_teams[match_id], ideal_teams[match_id] = pick_team_role_based(bat_rows, bowl_rows)

pd.set_option('display.max_columns', None) # added for debugging move to top

results = []

for match_id in my_teams:
    model_df = pd.DataFrame(my_teams[match_id])
    ideal_df = pd.DataFrame(ideal_teams[match_id])

    model_df['FP'] = model_df.get('Batting_FP', 0).fillna(0) + model_df.get('Bowling_FP', 0).fillna(0)
    ideal_df['FP'] = ideal_df.get('Batting_FP', 0).fillna(0) + ideal_df.get('Bowling_FP', 0).fillna(0)

    model_score = model_df['FP'].sum()
    ideal_score = ideal_df['FP'].sum()
    efficiency = model_score / ideal_score if ideal_score > 0 else 0

    top_actual = ideal_df.sort_values(by='FP', ascending=False).head(5)['fullName'].values
    top_hits = model_df['fullName'].isin(top_actual).sum()

    top_actual_11 = ideal_df.sort_values(by='FP', ascending=False).head(11)['fullName'].values
    top_hits_11 = model_df['fullName'].isin(top_actual_11).sum()

    venue = model_df['venue'].iloc[0] if 'venue' in model_df.columns else 'unknown'

    results.append({
        'match_id': match_id,
        'venue': venue,
        'model_score': model_score,
        'ideal_score': ideal_score,
        'efficiency': efficiency,
        'top5_hits': top_hits,
        'top11_hits': top_hits_11,
        'batting_contrib': model_df['Batting_FP'].sum(),
        'bowling_contrib': model_df['Bowling_FP'].sum()
    })

results_df = pd.DataFrame(results)

# summary
avg_eff = results_df['efficiency'].mean()
match_80p = (results_df['efficiency'] >= 0.80).mean()

avg_efficiency = results_df['efficiency'].mean()
match_80p = (results_df['efficiency'] >= 0.80).mean()

print(f"Average Efficiency: {avg_efficiency:.2%}")
print(f"% Matches with ≥80% of Ideal Score: {match_80p:.2%}")

Average Efficiency: 90.58%
% Matches with ≥80% of Ideal Score: 75.47%

plt.hist(results_df['efficiency'], bins=20, color='skyblue', edgecolor='black')
# plt.axvline(0.8, color='green', linestyle='--', label='80% Threshold')
plt.axvline(avg_efficiency, color='red', linestyle='--', label=f'Avg = {avg_efficiency:.2%}')
plt.title("Team Efficiency (Model vs Ideal)")
plt.xlabel("Efficiency (Model Score / Ideal Score)")
plt.ylabel("Number of Matches")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

results_df['ideal_batting'] = [
    pd.DataFrame(ideal_teams[mid])['Batting_FP'].fillna(0).sum()
    for mid in results_df['match_id']
]

results_df['ideal_bowling'] = [
    pd.DataFrame(ideal_teams[mid])['Bowling_FP'].fillna(0).sum()
    for mid in results_df['match_id']
]

# averages
print("Model-picked team (average):")
print(f"  Batting FP: {results_df['batting_contrib'].mean():.2f}")
print(f"  Bowling FP: {results_df['bowling_contrib'].mean():.2f}")

print("Ideal team (average):")
print(f"  Batting FP: {results_df['ideal_batting'].mean():.2f}")
print(f"  Bowling FP: {results_df['ideal_bowling'].mean():.2f}")

Model-picked team (average):
  Batting FP: 189.33
  Bowling FP: 172.62
Ideal team (average):
  Batting FP: 221.89
  Bowling FP: 181.17

print(f"Average Top-5 Hit Rate: {results_df['top5_hits'].mean():.2f} out of 5")
print(f"Average Top-11 Hit Rate: {results_df['top11_hits'].mean():.2f} out of 11")

Average Top-5 Hit Rate: 3.50 out of 5
Average Top-11 Hit Rate: 6.96 out of 11

# histogram of top-5 hits (out of 5)
plt.figure(figsize=(8, 5))
plt.hist(results_df['top5_hits'], bins=[0, 1, 2, 3, 4, 5, 6], edgecolor='black', color='mediumseagreen', rwidth=0.8)
plt.title("Distribution of Top-5 FP Players Captured by Model")
plt.xlabel("Number of Top-5 Actual Scorers in Team")
plt.ylabel("Number of Matches")
plt.xticks(range(0, 6))
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.show()

venue_eff = results_df.groupby('venue')['efficiency'].mean().sort_values(ascending=False)

# visualize efficiency by venue
plt.figure(figsize=(10, 6))
venue_eff.plot(kind='bar', color='skyblue')

plt.title('Average Efficiency by Venue', fontsize=14)
plt.xlabel('Venue', fontsize=12)
plt.ylabel('Average Efficiency', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

combined_df = pd.concat([batters_df, bowlers_df], ignore_index=True)
combined_df['Batting_FP'] = combined_df['Batting_FP'].fillna(0)
combined_df['Bowling_FP'] = combined_df['Bowling_FP'].fillna(0)
combined_df['FP'] = combined_df['Batting_FP'] + combined_df['Bowling_FP']
combined_df['confidence'] = combined_df.get('bat_confidence', 0).fillna(0) + combined_df.get('bowl_confidence', 0).fillna(0)

correlation = combined_df[['confidence', 'FP']].corr().iloc[0, 1]
print(f"\nCorrelation between model confidence and actual FP: {correlation:.2f}")

Correlation between model confidence and actual FP: 0.46

relevant_cols = [
    'Batting_FP', 'Bowling_FP', 'bat_confidence', 'bowl_confidence',
    'career_Batting_FP_mean', 'career_Bowling_FP_mean',
    'Batting_FP_rolling5', 'Bowling_FP_rolling5',
    'runs_rolling5', 'wickets_rolling5',
    'career_runs_mean', 'career_wickets_mean'
]

subset = combined_df[relevant_cols].copy()
subset = subset.fillna(0)

correlation_matrix = subset.corr()

print("Correlations with Batting_FP:")
print(correlation_matrix['Batting_FP'].sort_values(ascending=False))

print("\nCorrelations with Bowling_FP:")
print(correlation_matrix['Bowling_FP'].sort_values(ascending=False))

plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", square=True)
plt.title("Feature Correlation Matrix")
plt.tight_layout()
plt.show()

Correlations with Batting_FP:
Batting_FP                1.000000
bat_confidence            0.613760
career_runs_mean          0.549312
career_Batting_FP_mean    0.545038
runs_rolling5             0.523421
Batting_FP_rolling5       0.514257
Bowling_FP               -0.269456
Bowling_FP_rolling5      -0.384229
wickets_rolling5         -0.391773
career_Bowling_FP_mean   -0.424548
career_wickets_mean      -0.430301
bowl_confidence          -0.431027
Name: Batting_FP, dtype: float64

Correlations with Bowling_FP:
Bowling_FP                1.000000
bowl_confidence           0.693009
career_wickets_mean       0.515283
career_Bowling_FP_mean    0.510844
wickets_rolling5          0.487790
Bowling_FP_rolling5       0.480965
Batting_FP               -0.269456
Batting_FP_rolling5      -0.379710
runs_rolling5            -0.392347
career_Batting_FP_mean   -0.428835
bat_confidence           -0.430064
career_runs_mean         -0.437954
Name: Bowling_FP, dtype: float64

	match_id	season	match_name	home_team	away_team	venue	bowling_team	batting_team	batting_innings	fullName	batting_position	runs	balls	fours	sixes	strike_rate	Batting_FP
0	1359475	2023	GT v CSK	GT	CSK	Narendra Modi Stadium, Motera, Ahmedabad	GT	CSK	1	Devon Conway	1	1	6	0	0	16.66	1
1	1359475	2023	GT v CSK	GT	CSK	Narendra Modi Stadium, Motera, Ahmedabad	GT	CSK	1	Ruturaj Gaikwad	2	92	50	4	9	184.00	128
2	1359475	2023	GT v CSK	GT	CSK	Narendra Modi Stadium, Motera, Ahmedabad	GT	CSK	1	Moeen Ali	3	23	17	4	1	135.29	31
3	1359475	2023	GT v CSK	GT	CSK	Narendra Modi Stadium, Motera, Ahmedabad	GT	CSK	1	Ben Stokes	4	7	6	1	0	116.66	8
4	1359475	2023	GT v CSK	GT	CSK	Narendra Modi Stadium, Motera, Ahmedabad	GT	CSK	1	Ambati Rayudu	5	12	12	0	1	100.00	14

	Avg Target = 1	Max Target = 1	Min Target = 1	Avg Target = 0	Max Target = 0	Min Target = 0	Avg Players per Match	Max Players per Match	Min Players per Match
target	3.496839	8.0	0.0	10.804046	21.0	1.0	14.300885	22.0	3.0
predicted_target_by_RandomForestClassifier	7.509482	11.0	1.0	6.791403	16.0	0.0	14.300885	22.0	3.0
predicted_target_by_XGBoost	6.102402	11.0	0.0	8.198483	18.0	0.0	14.300885	22.0	3.0
predicted_target_by_Logistic Regression	6.867257	10.0	2.0	7.433628	16.0	0.0	14.300885	22.0	3.0
predicted_target_by_CatBoost	6.106195	11.0	1.0	8.194690	18.0	0.0	14.300885	22.0	3.0
predicted_target_by_LightGBM	6.260430	11.0	1.0	8.040455	17.0	0.0	14.300885	22.0	3.0
predicted_target_by_Extra Trees	7.677623	11.0	1.0	6.623262	15.0	0.0	14.300885	22.0	3.0
predicted_target_by_Balanced Bagging	8.246523	11.0	2.0	6.054362	15.0	0.0	14.300885	22.0	3.0

	Avg Target = 1	Max Target = 1	Min Target = 1	Avg Target = 0	Max Target = 0	Avg Players per Match	Max Players per Match	Min Players per Match
target	3.867257	10.0	0.0	7.332491	13.0	11.199747	16.0	4.0
predicted_target_by_RandomForestClassifier	6.924147	10.0	1.0	4.275601	11.0	11.199747	16.0	4.0
predicted_target_by_XGBoost	6.962073	11.0	1.0	4.237674	11.0	11.199747	16.0	4.0
predicted_target_by_Logistic Regression	6.328698	11.0	1.0	4.871049	12.0	11.199747	16.0	4.0
predicted_target_by_CatBoost	7.317320	12.0	1.0	3.882427	10.0	11.199747	16.0	4.0
predicted_target_by_LightGBM	4.823009	10.0	0.0	6.376738	12.0	11.199747	16.0	4.0
predicted_target_by_Extra Trees	7.230088	11.0	1.0	3.969659	11.0	11.199747	16.0	4.0
predicted_target_by_Balanced Bagging	7.243995	11.0	1.0	3.955752	11.0	11.199747	16.0	4.0

Fantasy Sports Prediction in Cricket¶

Summer 2025 Data Science Project¶

Contributions:¶

Introduction¶

Data Curation¶

What We Got¶

Data importing and cleaning (batters)¶

Batting Data Structure¶

Data importing and cleaning (now for bowlers)¶

Bowling Data Structure¶

Data Cleaning continued: Venue & Team Filtering¶

Exploratory Data Analysis (EDA)¶

Key Questions:¶

Match Counts by Venue¶

Data Exploration¶

Distribution of Batting Fantasy Points¶

Match-Wise Fantasy Points: Batting vs Bowling¶

Venue-Wise Batting Performance¶

Hypothesis Testing: Home Ground Advantage¶

Data Leakage¶

Feature Engineering¶

1. Recent Form (rolling5_*)¶

2. Career Averages (career_*)¶

3. Match Conditions¶

Why This Matters¶

Feature Engineering¶

Career Averages¶

How It Works¶

Why It’s Valuable¶

Correlation with Fantasy Points (Batting)¶

Correlation Observations for Batters¶

Correlation with Fantasy Points (Bowling)¶

Correlation Observations for Bowlers¶

Machine Learning: Predicting Fantasy Points¶

Target Variables¶

Features Used¶

Models Trained¶

Batting_FP regression¶

Predicting Bowling Fantasy Scores¶

Here comes the twist¶

Helper Functions¶

Model Evaluation Strategy¶

Custom Model Pipeline¶

Classifying Batters¶

Interpreting the Outputs¶

Match-Level Prediction Breakdown¶

Target Distribution Insights¶

Model Tuning: XGBoost Optimization¶

Custom Evaluation Function¶

Custom Grid Search function¶

Finals Batters Model¶

Final Model Output Summary¶

Building a batters team of 5/6 players¶

Building a Batters Team of 5/6 Players¶

Efficiency of Predicted vs. Actual Top Batters¶

Distribution of Match Score Efficiencies¶

Predicting best bowlers¶

Training classifiers for bowlers¶

Match-Level Accuracy Comparison¶

Bowlers Model Evaluation Results¶

Making the best possible LightGBM model¶

Building a bowlers team of 5¶

FINALLY: making a real team (the real deal)¶

Assigning Player Roles: Batters, Bowlers, All-rounders, and Wicketkeepers¶

Building a Fantasy Team¶

Visualization & Result Analysis¶

Efficiency Distribution¶

Top 5 Player Hits¶

Venue-Wise Accuracy¶

Correlation Insights¶

Insights and Conclusions¶

1. Recent Form (`rolling5_*`)¶

2. Career Averages (`career_*`)¶