I was given a dataset about an online game that made money through microtransactions. The csv dataset was reasonably short (43kb) with seven columns, each containing important information. Luckily, there was no missing data. I was asked to interpret the data, and provide actionable insights about the demographics and most profitable items for in-game purchases.
Here, I used Pandas in a Jupyter Notebook.
Of the 1163 active players, the vast majority are male (84%). There also exists, a smaller, but notable proportion of female players (14%).
Our peak age demographic falls between 20-24 (44.8%) with secondary groups falling between 15-19 (18.60%) and 25-29 (13.4%).
# Dependencies and Setup
import pandas as pd
import numpy as np
# File to Load (Remember to Change These)
file_to_load = "Resources/purchase_data.csv"
# Read Purchasing File and store into Pandas data frame
df = pd.read_csv(file_to_load)
df.head(10)
# Calculate the Number of Unique Players
player_demographics = df.loc[:, ["Gender", "SN", "Age"]]
player_demographics = player_demographics.drop_duplicates()
num_players = player_demographics.count()[0] # Display the total number of players
pd.DataFrame({"Total Players": [num_players]})
unique_item_count = len(df['Item ID'].unique())
average_price_of_items = round(float(df['Price'].mean()), 2)
count_of_purchases = len(df['Price'])
price_sum = float(df['Price'].sum())
price_sum
summary_dataframe = pd.DataFrame({
'Number of Unique Items': [unique_item_count],
'Average Price': '$' + str(average_price_of_items),
'Number of Purchases': [count_of_purchases],
'Total Revenue': '$' + str(price_sum)
})
summary_dataframe
df_gender1 = df[['Gender','SN']].drop_duplicates(subset = 'SN')
gender_count = df_gender1['Gender'].value_counts(0)
gender_percent = df_gender1['Gender'].value_counts(1)
gender_count_df = pd.DataFrame(gender_count)
gender_percent_df = round(pd.DataFrame(gender_percent) * 100, 2)
gender_summary_df = gender_count_df.merge(gender_percent_df, left_index = True, right_index = True)
gender_summary_df.columns = ['Total Count', 'Percentage of Players']
gender_summary_df
df_gender_2 = df.groupby('Gender')
purchase_count = round(df_gender_2['Purchase ID'].count(), 0)
avg_purchase_price = round(df_gender_2['Price'].mean(), 2)
total_purchase_value = round(df_gender_2['Price'].sum(), 2)
purchase_value_per_gender = round(total_purchase_value / gender_count, 2)
summary_dataframe2 = pd.DataFrame([purchase_count, avg_purchase_price, total_purchase_value, purchase_value_per_gender])
summary2 = summary_dataframe2.T
summary2.columns = ['Purchase Count', 'Average Purchase Price', 'Total Purchase Value', 'Avg Total Purchase per Person']
summary2
bins = [0, 9, 14, 19, 24, 29, 34, 39, 150]
bin_labels = ['<10', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40+']
df["Total Count"] = pd.cut(df["Age"], bins, labels=bin_labels)
df_age1 = df[['Total Count','SN']].drop_duplicates(subset = 'SN')
age_demographics_summary = df_age1.groupby("Total Count").count()
age_counts = age_demographics_summary['SN']
age_demographics_percentages = round(age_counts / 576 * 100, 2)
age_demographics_percentages
summary_dataframe3 = pd.DataFrame([age_counts, age_demographics_percentages])
summary_data = summary_dataframe3.T
summary_data.columns = ['Total Count', 'Percentage of Players']
summary_data.head()
bins = [0, 9, 14, 19, 24, 29, 34, 39, 150]
bin_labels = ['<10', '10-14', '15-19', '20-24', '25-29', '30-34', '35-39', '40+']
df["Total Count"] = pd.cut(df["Age"], bins, labels=bin_labels)
less_rows = df[['Total Count','SN', 'Price']]
less_rows_grouped = less_rows.groupby('Total Count')
purchase_counts = less_rows_grouped['Price'].count()
average_prices = round(less_rows_grouped['Price'].mean(),2)
total_spent = round(less_rows_grouped['Price'].sum(), 2)
spending_per_person = round(total_spent/age_counts,2)
summary4 = pd.DataFrame([purchase_counts, average_prices, total_spent, spending_per_person])
summary4b = summary4.T
summary4b.columns = ['Purchase Count', 'Average Purchase Price', 'Total Purchase Value', 'Avg Total Purchase per Person']
summary4b.head()
df_sn_2 = df.groupby('SN')
purchase_counts = df_sn_2['Gender'].count()
average_spending = round(df_sn_2['Price'].mean(),2)
total_purchase = round(df_sn_2['Price'].sum(),2)
summary6 = pd.DataFrame([purchase_counts, average_spending, total_purchase])
summary7 = summary6.T
summary7.columns = ['Purchase Count', 'Average Purchase Price', 'Total Purchase Value']
summary7.sort_values('Total Purchase Value', ascending=False).reset_index().head()
df_sn_3 = df.groupby(['Item ID', 'Item Name'])
purchase_counts2 = df_sn_3['Gender'].count()
average_spending2 = round(df_sn_3['Price'].mean(),2)
total_purchase2 = round(df_sn_3['Price'].sum(),2)
summary7 = pd.DataFrame([purchase_counts2, average_spending2, total_purchase2])
summary8 = summary7.T
summary8.columns = ['Purchase Count', 'Item Price', 'Total Purchase Value']
summary9 = summary8.sort_values('Purchase Count', ascending=False)
summary9.head()
summary8.sort_values('Total Purchase Value', ascending=False).head()