I was given two datasets with data relating to mouse trials for anti-cancer drugs. This data is similar to what would be obtained from a medical research lab. I was asked to interpret the data and to exactly reproduce three line plots and a special custom plot.
I used Pandas and MatPlotLib in a Jupyter Notebook.
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')
# Load the two source tables (remember to change these paths if the data moves)
mouse_drug_data = pd.read_csv("data/mouse_drug_data.csv")
clinical_trial_data = pd.read_csv("data/clinicaltrial_data.csv")
# Combine both tables into one frame. The left join keeps every clinical-trial
# row and adds the columns of the matching mouse_drug_data row.
# The join key is named once; the original listed "Mouse ID" twice.
df = pd.merge(clinical_trial_data, mouse_drug_data, how="left", on="Mouse ID")
# Preview the merged frame
df.head()
# Mean tumor volume for every (Drug, Timepoint) pair.
# The column is selected *before* aggregating so only that numeric column is
# averaged. Averaging the whole frame first also tries to average every other
# column, and recent pandas versions raise on non-numeric ones
# (presumably the "Mouse ID" identifiers -- confirm against the data).
tumor_vols_mean = df.groupby(["Drug", "Timepoint"])["Tumor Volume (mm3)"].mean()
# Convert to a DataFrame and turn the group keys back into ordinary columns
tumor_vols_mean_df = pd.DataFrame(tumor_vols_mean).reset_index()
# Preview DataFrame
tumor_vols_mean_df.head()
# Standard error of the mean tumor volume for every (Drug, Timepoint) pair.
# The column is selected *before* aggregating so only that numeric column is
# processed. Calling sem() on the whole frame first also processes every other
# column, and recent pandas versions raise on non-numeric ones
# (presumably the "Mouse ID" identifiers -- confirm against the data).
tumor_vols_se = df.groupby(["Drug", "Timepoint"])["Tumor Volume (mm3)"].sem()
# Convert to a DataFrame and turn the group keys back into ordinary columns
tumor_vols_se_df = pd.DataFrame(tumor_vols_se).reset_index()
# Preview DataFrame
tumor_vols_se_df.head()
# Reshape from long to wide: one row per timepoint, one column per drug,
# with the tumor volume statistic as the cell value.
tumor_vols_mean_df_wide = tumor_vols_mean_df.pivot(
    index="Timepoint", columns="Drug", values="Tumor Volume (mm3)"
)
tumor_vols_se_df_wide = tumor_vols_se_df.pivot(
    index="Timepoint", columns="Drug", values="Tumor Volume (mm3)"
)
# Quick look at the reshaped means to confirm the layout
tumor_vols_mean_df_wide.head()
# Generate the plot (with error bars): mean tumor volume over time for four
# drugs, using the standard error as the error-bar size.
# The wide tables are indexed by timepoint, so the index is the x value.
# (drug, line color, marker) for each series drawn on this figure
tumor_plot_styles = [
    ("Capomulin", "r", "o"),
    ("Infubinol", "b", "^"),
    ("Ketapril", "g", "s"),
    ("Placebo", "k", "d"),
]
for drug, color, marker in tumor_plot_styles:
    plt.errorbar(
        tumor_vols_mean_df_wide.index,
        tumor_vols_mean_df_wide[drug],
        yerr=tumor_vols_se_df_wide[drug],
        color=color,
        marker=marker,
        markersize=5,
        linestyle="dashed",
        linewidth=0.50,
        # Explicit label: the legend must not depend on matplotlib inferring
        # a label from the pandas Series name.
        label=drug,
    )
plt.title("Tumor Response to Treatment")
plt.ylabel("Tumor Volume (mm3)")
plt.xlabel("Time (Days)")
plt.grid(True)
plt.legend(loc="best", fontsize="small", fancybox=True)
# Save the Figure
plt.savefig("analysis/Fig1.png")
# Show the Figure
plt.show()
# Mean number of metastatic sites for every (Drug, Timepoint) pair.
# The column is selected *before* aggregating so only that numeric column is
# averaged. Averaging the whole frame first also tries to average every other
# column, and recent pandas versions raise on non-numeric ones
# (presumably the "Mouse ID" identifiers -- confirm against the data).
metastatic_response_mean = df.groupby(["Drug", "Timepoint"])["Metastatic Sites"].mean()
# Convert to DataFrame (the (Drug, Timepoint) group keys stay in the index)
metastatic_response_mean_df = pd.DataFrame(metastatic_response_mean)
# Preview DataFrame
metastatic_response_mean_df.head()
# Standard error of the metastatic-site count for every (Drug, Timepoint) pair.
# The column is selected *before* aggregating so only that numeric column is
# processed. Calling sem() on the whole frame first also processes every other
# column, and recent pandas versions raise on non-numeric ones
# (presumably the "Mouse ID" identifiers -- confirm against the data).
metastatic_response_se = df.groupby(["Drug", "Timepoint"])["Metastatic Sites"].sem()
# Convert to DataFrame (the (Drug, Timepoint) group keys stay in the index)
metastatic_response_se_df = pd.DataFrame(metastatic_response_se)
# Preview DataFrame
metastatic_response_se_df.head()
# Flatten the grouped index into columns, then reshape to wide form:
# one row per timepoint, one column per drug.
metastatic_response_mean_df2 = metastatic_response_mean_df.reset_index()
metastatic_response_se_df2 = metastatic_response_se_df.reset_index()
metastatic_response_mean_df_wide = metastatic_response_mean_df2.pivot(
    index="Timepoint", columns="Drug", values="Metastatic Sites"
)
metastatic_response_se_df_wide = metastatic_response_se_df2.pivot(
    index="Timepoint", columns="Drug", values="Metastatic Sites"
)
# Quick look at the reshaped means to confirm the layout
metastatic_response_mean_df_wide.head()
# Plot the mean metastatic-site count over time for four drugs, using the
# standard error as the error-bar size. The wide tables are indexed by
# timepoint, so the index is the x value.
# (drug, line color, marker) for each series drawn on this figure
metastatic_plot_styles = [
    ("Capomulin", "r", "o"),
    ("Infubinol", "b", "^"),
    ("Ketapril", "g", "s"),
    ("Placebo", "k", "d"),
]
for drug, color, marker in metastatic_plot_styles:
    plt.errorbar(
        metastatic_response_mean_df_wide.index,
        metastatic_response_mean_df_wide[drug],
        yerr=metastatic_response_se_df_wide[drug],
        color=color,
        marker=marker,
        markersize=5,
        linestyle="dashed",
        linewidth=0.50,
        # Explicit label: the legend must not depend on matplotlib inferring
        # a label from the pandas Series name.
        label=drug,
    )
plt.title("Metastatic Spread During Treatment")
plt.ylabel("Met. Sites")
plt.xlabel("Time (Days)")
plt.grid(True)
plt.legend(loc="best", fontsize="small", fancybox=True)
# Save the Figure
plt.savefig("analysis/Fig2.png")
# Show the Figure
plt.show()
# Number of recorded measurements per (Drug, Timepoint) pair, counted from
# the non-null tumor volume entries.
grouped_by_drug_and_time = df.groupby(["Drug", "Timepoint"])
mice_still_alive = grouped_by_drug_and_time.count()["Tumor Volume (mm3)"]
# Wrap the counts in a DataFrame; the group keys remain in the index
mice_still_alive_df = pd.DataFrame(mice_still_alive)
# Preview with the index reset so "Drug" and "Timepoint" appear as regular
# columns (the stored frame itself is left unchanged).
preview_rows = mice_still_alive_df.head()
preview_rows.reset_index()
# Flatten the grouped index into columns, then reshape to wide form:
# one row per timepoint, one column per drug, cell = count.
mice_still_alive_df2 = mice_still_alive_df.reset_index()
mice_still_alive_df_wide = mice_still_alive_df2.pivot(
    index="Timepoint", columns="Drug", values="Tumor Volume (mm3)"
)
# Quick look at the reshaped counts
mice_still_alive_df_wide.head()
# Generate the plot (accounting for percentages): share of each drug's
# starting count still present at every timepoint.
# (drug, matplotlib format string, marker size) for each series.
# NOTE(review): Placebo uses markersize 6 while the others use 5; this is
# kept from the original -- confirm whether it is intentional.
survival_plot_styles = [
    ("Capomulin", "ro", 5),
    ("Infubinol", "b^", 5),
    ("Ketapril", "gs", 5),
    ("Placebo", "kd", 6),
]
for drug, fmt, marker_size in survival_plot_styles:
    alive_counts = mice_still_alive_df_wide[drug]
    # Normalise by the drug's own count in the first row (earliest timepoint)
    # instead of a hard-coded 25, so the percentage stays correct if a group
    # starts with a different number of mice.
    survival_pct = 100 * alive_counts / alive_counts.iloc[0]
    plt.plot(
        survival_pct,
        fmt,
        linestyle="dashed",
        markersize=marker_size,
        linewidth=0.50,
        # Explicit label: the legend must not depend on matplotlib inferring
        # a label from the pandas Series name.
        label=drug,
    )
plt.title("Mice Survival Rates During Treatment")
plt.ylabel("Survival Rate (%)")
plt.xlabel("Time (Days)")
plt.grid(True)
plt.legend(loc="best", fontsize="small", fancybox=True)
# Save the Figure
plt.savefig("analysis/Fig3.png")
# Show the Figure
plt.show()
# Percent change in mean tumor volume per drug, from the first row of the
# wide table to the last row.
first_timepoint_means = tumor_vols_mean_df_wide.iloc[0]
last_timepoint_means = tumor_vols_mean_df_wide.iloc[-1]
tumor_pct_change = 100 * (last_timepoint_means - first_timepoint_means) / first_timepoint_means
# Display the data to confirm
tumor_pct_change
# Collect the percent changes of the four plotted drugs, in plotting order
pct_changes = tuple(
    tumor_pct_change[drug_name]
    for drug_name in ("Capomulin", "Infubinol", "Ketapril", "Placebo")
)
# Bar chart of percent tumor volume change per drug.
# The first entry of pct_changes is drawn green ("pass"); the rest are drawn
# red ("fail").
fig, ax = plt.subplots()
ind = np.arange(len(pct_changes))
width = 1
rectsPass = ax.bar(ind[0], pct_changes[0], width, color='green')
rectsFail = ax.bar(ind[1:], pct_changes[1:], width, color='red')

# Orient widths. Add labels, tick marks, etc.
ax.set_ylabel('% Tumor Volume Change')
ax.set_title('Tumor Change Over 45 Day Treatment')
# NOTE(review): if the bars are centre-aligned on `ind` (matplotlib's current
# default -- confirm for the installed version), these ticks fall on each
# bar's right edge. Kept as in the original; check against the reference figure.
ax.set_xticks(ind + 0.5)
ax.set_xticklabels(('Capomulin', 'Infubinol', 'Ketapril', 'Placebo'))
# Fixed y-range so the chart does not rescale with the data
ax.set_autoscaley_on(False)
ax.set_ylim([-30, 70])
ax.grid(True)


# Use functions to label the percentages of changes
def autolabelFail(rects):
    """Write each bar's truncated integer height as 'N%' in white at y=3."""
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., 3,
                '%d%%' % int(height),
                ha='center', va='bottom', color="white")


def autolabelPass(rects):
    """Write each bar's truncated integer height as '-N% ' in white at y=-8.

    The magnitude is taken with abs(), so the label has exactly one minus
    sign whether the rectangle reports its height as negative or positive.
    """
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., -8,
                '-%d%% ' % abs(int(height)),
                ha='center', va='bottom', color="white")


# Label the bars
autolabelPass(rectsPass)
autolabelFail(rectsFail)
# Save the Figure
fig.savefig("analysis/Fig4.png")
# Show the Figure (plt.show() matches the other figures in this notebook)
plt.show()