import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the data from a TSV file
file_path = 'cpet_clinical_data.tsv'
df = pd.read_csv(file_path, sep='\t')
df_matched = df.dropna(subset=['matched_pair']).copy()
metrics = [
'DBP', # Diastolic Blood Pressure
'HR', # Heart Rate
#'OUES', # Oxygen Uptake Equivalent Slope
'PETCO2', # End-Tidal CO2
'PETO2', # End-Tidal O2
'PP', # Pulse Pressure
'RER', # Respiratory Exchange Ratio
'RPE', # Rating of Perceived Exertion
'RPM', # Pedal RPM
'RPP', # Rate Pressure Product
'RR', # Respiratory Rate
'SBP', # Systolic Blood Pressure
'VCO2', # Carbon Dioxide Production
'VO2', # Oxygen Consumption
'VO2_HR', # Oxygen Consumption per Heart Rate
'VO2_t', # Oxygen consumption (ml/min)
'Ve_BTPS', # Ventilation (BTPS)
'Ve_VCO2', # Ventilatory Equivalent for CO2
'Ve_VO2', # Ventilatory Equivalent for O2
'Vt_BTPS_L', # Tidal Volume (BTPS in Liters)
'wkld', # Workload
]
# Convert all metric columns to numeric, coercing errors
for metric in metrics:
df_matched[metric] = pd.to_numeric(df_matched[metric], errors='coerce')
# Split the data by time points
time_points = ['AT', 'max', 'rest']
# Initialize an empty list to store differences
diff_data = []
for time_point in time_points:
# Filter data for the current time point
df_tp = df_matched[df_matched['Time_Point'] == time_point]
# Split the data into Day 1 and Day 2
day1 = df_tp[df_tp['Study_Visit'] == 'D1']
day2 = df_tp[df_tp['Study_Visit'] == 'D2']
# Merge day1 and day2 on ParticipantID
merged = pd.merge(day1, day2, on=['ParticipantID', 'matched_pair', 'sex', 'Time_Point', 'race', 'phenotype'], suffixes=('_D1', '_D2'))
# Calculate the percentage difference for each metric
for metric in metrics:
merged[metric + '_pct_diff'] = ((merged[metric + '_D2'] - merged[metric + '_D1']) / merged[metric + '_D1']) * 100
# Keep only relevant columns and add time point information
diff_columns = ['ParticipantID', 'matched_pair', 'sex', 'Time_Point', 'phenotype'] + [metric + '_pct_diff' for metric in metrics]
merged['Time_Point'] = time_point
diff_data.append(merged[diff_columns])
# Combine the differences for all time points
cpet_diff = pd.concat(diff_data)
# Define the metric you want to plot
metric_to_plot = 'PETCO2' # Change this to the metric you want to plot
timepoint_to_plot = 'max' # Change this to the time point you want to plot [max, AT, rest]
# Filter the data for the chosen time point
timepoint_data = cpet_diff[cpet_diff['Time_Point'] == timepoint_to_plot].copy()
# Separate 'HC' phenotype data and sort it by the selected metric
hc_data = timepoint_data[timepoint_data['phenotype'] == 'HC']
hc_data_sorted = hc_data.sort_values(by=metric_to_plot + '_pct_diff')
# Create a mapping from matched_pair to its sorted index
matched_pair_order = {mp: i for i, mp in enumerate(hc_data_sorted['matched_pair'])}
# Reorder the 'matched_pair' column based on the sorted indices
timepoint_data['matched_pair_order'] = timepoint_data['matched_pair'].map(matched_pair_order)
# Set up the plot
plt.figure(figsize=(12, 8))
# Create a scatter plot for the 'AT' time point, ordered by matched_pair_order
sns.scatterplot(
data=timepoint_data,
x='matched_pair_order',
y=metric_to_plot + '_pct_diff',
hue='phenotype',
palette={'MECFS': 'red', 'HC': 'blue'},
s=100
)
# Customizing the plot
plt.xlabel('Matched Pair')
plt.ylabel(f'Percentage Difference in {metric_to_plot}')
plt.title(f'Percentage Difference in {metric_to_plot} for Time Point: {timepoint_to_plot}')
# Rotate x-axis labels to prevent overlap
plt.xticks(ticks=range(len(matched_pair_order)), labels=[int(mp) for mp in matched_pair_order.keys()], rotation=45, ha='right')
plt.legend(title='Phenotype', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
plt.tight_layout()
plt.savefig(f'{metric_to_plot}_{timepoint_to_plot}_pct_diff_plot.png', bbox_inches='tight')
# Show the plot
plt.show()
#####
##### The following is just to make the percentage of MECFS at max that increased VO2.
#####
# Filter the original df for ME/CFS participants at the AT time point
mecfs_at_data = df[(df['phenotype'] == 'MECFS') & (df['Time_Point'] == 'max')]
# Split the data into Day 1 (D1) and Day 2 (D2)
day1 = mecfs_at_data[mecfs_at_data['Study_Visit'] == 'D1']
day2 = mecfs_at_data[mecfs_at_data['Study_Visit'] == 'D2']
# Merge Day 1 and Day 2 data on ParticipantID
merged_mecfs = pd.merge(day1, day2, on=['ParticipantID', 'Time_Point'], suffixes=('_D1', '_D2'))
# Calculate the percentage of ME/CFS participants where VO2 increased from D1 to D2
total_mecfs = merged_mecfs.shape[0]
mecfs_increased = merged_mecfs[merged_mecfs['VO2_D2'] > merged_mecfs['VO2_D1']].shape[0]
# Calculate the percentage
if total_mecfs > 0:
percent_increased = (mecfs_increased / total_mecfs) * 100
else:
percent_increased = 0
# Print the result
print(f"Percentage of ME/CFS participants with increased VO2 at max from D1 to D2: {percent_increased:.2f}%")