eda_on_exam_data / perform_eda.py
eagle0504's picture
Upload folder using huggingface_hub
3e50e1d verified
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the dataset
# The CSV location is kept in its own variable so it is easy to spot and
# change; the path below points under /kaggle/input.
dataset_path = '/kaggle/input/students-performance-in-exams/StudentsPerformance.csv'
df = pd.read_csv(dataset_path)
# --- EDA ---
# Text-only overview of the frame: structure, score statistics, null counts.
# Each section is followed by the same 50-character "=" rule.
score_cols = ['math score', 'reading score', 'writing score']
section_break = "\n" + "=" * 50 + "\n"

# Basic Information
# df.info() emits its own report, so its return value is not printed.
print("Dataset Info:")
df.info()
print(section_break)

# Summary statistics restricted to the three score columns.
print("Descriptive Statistics for Scores:")
score_summary = df[score_cols].describe()
print(score_summary)
print(section_break)

# Missing Values: count of null entries in every column.
print("Missing values per column:")
null_counts = df.isnull().sum()
print(null_counts)
print(section_break)
# --- Visualizations ---
# All charts share one 3x2 grid: three score histograms, then three box plots.
# The finished figure is written to a single PNG in the working directory.
subject_columns = ['math score', 'reading score', 'writing score']
output_file = 'students_performance_eda.png'

fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(18, 18))
fig.suptitle('Exploratory Data Analysis of Student Performance', fontsize=20)

# Distribution of Scores
# (column, bar colour, panel title) for each histogram; axes.flat walks the
# grid row by row, so these land on panels [0, 0], [0, 1] and [1, 0].
histogram_specs = [
    ('math score', 'skyblue', 'Distribution of Math Scores'),
    ('reading score', 'lightcoral', 'Distribution of Reading Scores'),
    ('writing score', 'lightgreen', 'Distribution of Writing Scores'),
]
for (column, shade, heading), panel in zip(histogram_specs, axes.flat):
    sns.histplot(df[column], kde=True, ax=panel, color=shade)
    panel.set_title(heading)

# Scores by Gender
gender_panel = axes[1, 1]
sns.boxplot(data=df, x='gender', y='math score', ax=gender_panel)
gender_panel.set_title('Math Score by Gender')

# Scores by Parental Level of Education (using a combined score)
# The combined score is the row-wise mean of the three subject scores; it is
# stored as a new 'average score' column on df and reused by the last panel.
df['average score'] = df[subject_columns].mean(axis=1)
education_panel = axes[2, 0]
sns.boxplot(
    data=df,
    x='parental level of education',
    y='average score',
    ax=education_panel,
)
education_panel.set_title('Average Score by Parental Education Level')
# Tilt the x tick labels by 45 degrees on this panel only.
education_panel.tick_params(axis='x', rotation=45)

# Scores by Test Preparation Course
prep_panel = axes[2, 1]
sns.boxplot(data=df, x='test preparation course', y='average score', ax=prep_panel)
prep_panel.set_title('Average Score by Test Preparation Course')

# Constrain the subplots to a rectangle that stops at 96% of the figure
# height (and starts at 3%) so they do not collide with the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.96])
plt.savefig(output_file)
print("EDA plots saved to students_performance_eda.png")