For those unfamiliar with survival analysis, there is a cursory introduction in Mini Appendix D, at the end of this notebook.
This notebook is published under the Creative Commons CC BY-NC-SA 4.0 license by mathias.elsner.
You may share and adapt this material for non-commercial purposes under the same license, giving attribution.
Of course, this paragraph is supplied tongue-in-cheek, as there will be no reason at all for someone to re-use this stuff.
This is a walk-through of performing statistical time-to-event analysis, exclusively employing open-source tools. My background is that of a clinician, not a statistician, so I have chosen a historical medical dataset to showcase the methods.
The Framingham Heart Study, inaugurated in 1948, arguably is the single most famous observational study in the history of medicine.
The study was seminal for the concept of cardiovascular risk factors, which seems so self-evident three generations later. The very first publication, in 1957, covered 4 years of follow-up, which was later remarkably extended to 24 years. The importance of the Framingham Heart Study and its multiple extensions and spinoffs may be appreciated from the fact that, over time, it gave rise to more than 3000 original publications in peer-reviewed journals...
Further background information can be found on Wikipedia or on the Framingham Study's homepage. You could even watch a twelve-minute YouTube video on the 70th anniversary of the study by clicking the thumbnail:
A dilettante (from Italian dilettare, from Latin delectare, "to rejoice", "to delight") is a lover of an art or science who engages in it without formal training and not professionally. As an amateur or layman, he does a thing for its own sake, that is, out of interest, pleasure or passion. That's me with regard to programming languages and to statistics ;-)
I do not have access to commercial statistics software like SPSS, MedCalc or SAS. Neither do I have experience in R programming or MATLAB scripting. Python rules, because of its versatility, readability and community!
I previously performed my fledgling data analyses in Python through scripts written with conventional IDEs or text editors, because that's where I came from. Reminiscence:
A fleeting stream of text and numeric output (from statistical test procedures etc.) would cascade through the terminal window, while layered plots cluttered my desktop. The more intricate an analysis got, the more changes accumulated, the more difficult it became to keep track of everything. Versioning and troubleshooting were veritable PITAs...
Jupyter Notebooks seem such a logical choice in hindsight. Yet, I am very late to the party.
Lifelines is a comprehensive Python library for survival analysis that neatly integrates with notebooks and comes with rather user-friendly documentation. Some background information on lifelines may be found in this article by the developer. Alternatives for survival analysis in Python might be scikit-survival, pysurvival or statsmodels. The latter library is very powerful and extensive, and quite useful for various statistical methods. However, to my mind, it is less intuitive to work with for survival analysis, which may be a matter of documentation rather than implementation.
The U.S. National Heart, Lung and Blood Institute provides a sanitized "teaching dataset" of the original Framingham Heart Study. On registration, I submitted a data request, which kindly was fulfilled:
The dataset is a subset of the study data collected from 1948 onwards and includes laboratory, clinic, questionnaire, and adjudicated event data on 4,434 participants. Data was collected during three examination periods, approximately 6 years apart, from roughly 1956 to 1968. Each participant was followed for a total of 24 years.
"Although the enclosed dataset contains Framingham data ‘as collected’ by Framingham investigators, specific methods were employed to ensure an anonymous dataset that protects patient confidentiality. Therefore, this dataset is inappropriate for publication purposes."
Acknowledged. See disclaimer.
The aim of this notebook is not to gain new insights from scientific data that have already been thoroughly analyzed decades ago. We will simply use the venerable dataset as a generic example to play around with. We will neither aim at a full descriptive nor formal analysis of the data.
The Framingham study is merely a starting point to try out some basic statistics stuff. Furthermore, this document is not a proper HowTo, but rather a walkthrough.
Like in any other Python project, we will first import some useful libraries:
# Import general data manipulation libraries
import pandas as pd
import numpy as np
# Import IFrame inline display
from IPython.display import IFrame
Then we will import the full dataset, and have a first look at the structure:
# Import full CSV dataset
survival_data = pd.read_csv("frmgham2_me_cohort-1.csv", sep=';')
print(survival_data)
       RANDID  SEX  TOTCHOL  AGE  SYSBP  DIABP  CURSMOKE  CIGPDAY    BMI  \
0        2448    1    195.0   39    106     70         0      0.0  26,97
1        6238    2    250.0   46    121     81         0      0.0  28,73
2        9428    1    245.0   48  127,5     80         1     20.0  25,34
3       10552    2    225.0   61    150     95         1     30.0  28,58
4       11252    2    285.0   46    130     84         1     23.0   23,1
...       ...  ...      ...  ...    ...    ...       ...      ...    ...
4429  9990894    2    248.0   48    131     72         1     20.0     22
4430  9993179    2    210.0   44  126,5     87         1     15.0  19,16
4431  9995546    2    269.0   52  133,5     83         0      0.0  21,47
4432  9998212    1    185.0   40    141     98         0      0.0   25,6
4433  9999312    2    196.0   39    133     86         1     30.0  20,91

      DIABETES  ...  CVD  HYPERTEN  TIMEAP  TIMEMI  TIMEMIFC  TIMECHD  \
0            0  ...    1         0    8766    6438      6438     6438
1            0  ...    0         0    8766    8766      8766     8766
2            0  ...    0         0    8766    8766      8766     8766
3            0  ...    1         1    2956    2956      2956     2956
4            0  ...    0         1    8766    8766      8766     8766
...        ...  ...  ...       ...     ...     ...       ...      ...
4429         0  ...    0         1    6433    6433      6433     6433
4430         0  ...    0         1    6729    6729      6729     6729
4431         0  ...    1         1    5939    8766      5209     5209
4432         0  ...    0         1    8766    8766      8766     8766
4433         0  ...    0         1    8766    8766      8766     8766

      TIMESTRK  TIMECVD  TIMEDTH  TIMEHYP
0         8766     6438     8766     8766
1         8766     8766     8766     8766
2         8766     8766     8766     8766
3         2089     2089     2956        0
4         8766     8766     8766     4285
...        ...      ...      ...      ...
4429      6433     6433     6433     2219
4430      6729     6729     6729     4396
4431      8766     5209     8766      735
4432      8766     8766     8766        0
4433      8766     8766     8766     4201

[4434 rows x 39 columns]
For clarity, only a selection of rows and columns is displayed.
The entire table is best viewed in a dedicated spreadsheet program like Excel, Numbers or LibreOffice Calc. For copyright and licensing reasons, I may not provide a download link to the full dataset that was supplied to me by NIH/NHLBI.
To get a better idea of the available variables, you could have a look at the embedded data reference document from NIH in the IFrame window below:
IFrame("documents/FraminghamDataDocumentation.pdf", width=750, height=400)
On mobile devices, PDF documents embedded in IFrames are not properly displayed. Most of the time, only the first page is rendered. If you encounter problems with scrolling through the document, you may open the Framingham Data Documentation in a new browser window by clicking the blue link in this textbox.
The Framingham Heart Study collected time-dependent event data for a number of clinical endpoints, amongst others, stroke, hospitalized myocardial infarction and combined "cardiovascular disease". Furthermore, some quite soft and subjective endpoints like "angina pectoris" were included in the dataset.
We have to bear in mind that, in the era of the study's inception, and during the initial cohort's follow-up, diagnostic tests and criteria were markedly different from today:
There were no troponin tests, nor even CK laboratory tests, which only became commercially available in the 1960s. Echocardiography was still in its infancy. Computed tomography (CT) scanning and coronary angiography did not enter the stage of medicine until the 1970s, whereas magnetic resonance imaging (MRI) only began to become clinically useful in the 1980s.
We will restrict most of our analyses to the clinical endpoint of mortality, that is, death from any cause.
The reasons for this are:
- Mortality is relevant.
- Mortality is unambiguous.
- The definition of all-cause mortality remains unchanged over time.
Let's import the first tool from the lifelines library. We will use some more methods from there as we go along.
# Import Kaplan-Meier survival method
from lifelines import KaplanMeierFitter
kmf = KaplanMeierFitter()
Let's prepare the timeline and outcome variables for our survival analysis:
# Choose time variable from full dataset
T = survival_data["TIMEDTH"]/365
# Choose event variable from full dataset
E = survival_data["DEATH"]
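As an aside, here is a quick sanity check on the day-to-year conversion (a sketch, assuming the maximal follow-up time of 8766 days seen in the raw data above marks the full 24 years): 8766 days are exactly 24 years of 365.25 days, so dividing by 365 overshoots by a negligible leap-year margin.

```python
# Sanity check (aside): the maximal follow-up of 8766 days equals 24 years of 365.25 days
full_followup_days = 8766
print(full_followup_days / 365.25)         # 24.0 -- exact, leap years included
print(round(full_followup_days / 365, 2))  # 24.02 -- tiny overshoot when dividing by 365
```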
Professional grade graphs can be generated with the matplotlib library.
# Import plotting library
from matplotlib import pyplot as plt
import matplotlib.ticker as mtick
Executed from within a standard Python environment, each figure would normally be displayed in a separate window supplied by the graphics backend and the computer's operating system. In order to display the figures within our Jupyter notebook, we will specify one of IPython's so-called magic commands (these special commands are preceded by a '%').
%matplotlib inline
To get a first idea of the temporal distribution of our event data, we will plot a simple histogram.
# Define graph size
plt.figure(figsize=(10,5))
# Define plot
ax = plt.subplot(111)
# Additional markers
plt.ylabel("Number of Fatalities")
plt.xlabel("(Years)")
# Additional Infos
plt.title("Framingham-Study (1948 ff.): Mortality Events Over Time", color = "#4C9900")
plt.text(18,5, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Plot histogram
plt.hist(T, bins = 24, range = (0,24), histtype='step')
(array([ 31.,  28.,  30.,  41.,  46.,  53.,  49.,  46.,  48.,  57.,  65.,
         70.,  60.,  75.,  78.,  83.,  82.,  81.,  72.,  90., 107.,  82.,
         85.,  91.]),
 array([ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11., 12.,
        13., 14., 15., 16., 17., 18., 19., 20., 21., 22., 23., 24.]),
 [<matplotlib.patches.Polygon at 0x12f7e0790>])
Like most survival data, ours is not normally distributed, but skewed.
We will then model a survival curve for our entire study population.
# Define graph size
plt.figure(figsize=(7,6))
# Define plot
ax = plt.subplot(111)
# Set percent formatter in order to display percent instead of float on y-axis
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
# Fit and plot model
kmf.fit(T, event_observed=E, label='surviving')
kmf.plot_survival_function(ax=ax)
plt.grid(visible=True, which='both', axis='both', color='#C0C0C0', linestyle='-', linewidth=0.25)
# Additional Infos
plt.text(0, 0.65, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
plt.xlabel("(Years)")
plt.title("Framingham-Study (1948 ff.): Overall Cohort Survival", color = "#4C9900")
Text(0.5, 1.0, 'Framingham-Study (1948 ff.): Overall Cohort Survival')
We will have a look at the influence of several 'covariates' ('predictor variables', 'independent variables') on survival.
Among the 4433 participants, 8.6 % of men and 7.1 % of women were diagnosed with diabetes mellitus, either at baseline or during follow-up. Presence of diabetes was defined by the subject being treated for it, or by a casual glucose level of 200 mg/dl or more.
Let's plot Kaplan-Meier survival curves of diabetic and nondiabetic subjects:
# Define graph size
plt.figure(figsize=(7,6))
# Define plot
ax = plt.subplot(111)
# Set percent formatter in order to display percent instead of float on y-axis
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
# Iterate discriminator for diabetes status and plot respective survival curves
value_range = [0, 1]
for value in value_range:
    disc = (survival_data["DIABETES"] == value)
    kmf.fit(T[disc], event_observed=E[disc], label=value)
    kmf.plot_survival_function(ax=ax)
# Add ticks, grid, title and labels
plt.tick_params(labeltop=False, labelright=True, right=True)
plt.xticks(np.arange(0, 25, step=1), fontsize = 10) # Set label locations.
plt.grid(visible=True, which='both', axis='both', color='#C0C0C0', linestyle='-', linewidth=0.25)
plt.title("Framingham-Study (1948 ff.): Survival Condition Diabetes mellitus", color = "#4C9900")
plt.xlabel("(Years)")
# Additional Info: license
plt.text(0, 0.2, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Set layout to fill canvas
plt.tight_layout()
Wow! The striking survival penalty for diabetic subjects most probably reflects the limited treatment options available at that time, as well as the relevance of diabetes as a cardiovascular risk factor.
Let's explore whether major cardiovascular events show a similar pattern, which would corroborate the above notion:
Although called survival analysis, clinical endpoints other than mortality can be analyzed in the same fashion. When employed for non-lethal endpoints, we use the term event-free survival to denote the time-course of endpoint events. Let's prepare some more variables...
# Choose time variables from full dataset
TM = survival_data["TIMEMIFC"]/365
TS = survival_data["TIMESTRK"]/365
# Choose event variables from full dataset
EM = survival_data["MI_FCHD"]
ES = survival_data["STROKE"]
...and plot their eventfree survival curves:
# Define plot
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize=(9,6))
fig.suptitle("Event-Free Survival of Diabetic vs. Nondiabetic Subjects")
# Subplot 1:
# Set percent formatter in order to display percent instead of float on y-axis
ax1.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
# Iterate discriminator for diabetes status and plot respective survival curves
value_range = [0, 1]
for value in value_range:
    disc = (survival_data["DIABETES"] == value)
    kmf.fit(TM[disc], event_observed=EM[disc], label=value)
    kmf.plot_survival_function(ax=ax1)
# Add ticks, grid, title and labels
ax1.tick_params(labeltop=False, labelright=False, right=False)
ax1.set_title("Myocardial Infarction", color = "#4C9900")
ax1.set(xlabel='(Years)')
# Set layout to fill canvas
plt.tight_layout()
# Subplot 2:
# Set percent formatter in order to display percent instead of float on y-axis
ax2.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
# Iterate discriminator for diabetes status and plot respective survival curves
value_range = [0, 1]
for value in value_range:
    disc = (survival_data["DIABETES"] == value)
    kmf.fit(TS[disc], event_observed=ES[disc], label=value)
    kmf.plot_survival_function(ax=ax2)
# Add ticks, grid, title and labels
ax2.tick_params(labeltop=False, labelright=True, right=True)
ax2.set_title("Stroke", color = "#4C9900")
ax2.set(xlabel='(Years)')
# Additional Info: license
plt.text(8, 0.35, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Set layout to fill canvas
plt.tight_layout()
It seems quite plausible from our data that the dire mortality outcome of diabetic subjects is related to cardiovascular events, which are the number one driver of mortality in the general population.
Since the pathophysiologic mechanisms and causal relationships between diabetes, cardiovascular disease and mortality have been extensively researched by now, we will not explore this issue further.
We will, instead, turn our naïve attention to a risk factor that is not amenable to medical therapy...
In the Framingham Study, educational status was stratified by four layers:
- 0-11 years (Elementary, Primary or Middle School)
- High School Diploma, GED
- Some College, Vocational School
- College (BS, BA) degree or more
So, turning back to all cause mortality, what is the influence of education? Let's plot:
# Define Figure
plt.figure(figsize=(8,7))
# Define survival plot
ax = plt.subplot(111)
# Set percent formatter in order to display percent instead of float
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
# Iterate discriminator
value_range = [1, 2, 3, 4]
for value in value_range:
    disc = (survival_data["educ"] == value)
    kmf.fit(T[disc], event_observed=E[disc], label=value)
    kmf.plot_survival_function(ax=ax)
# Add ticks, grid, title and labels
plt.tick_params(labeltop=False, labelright=True, right=True)
plt.xticks(np.arange(0, 25, step=1), fontsize = 10) # Set label locations.
plt.grid(visible=True, which='both', axis='both', color='#C0C0C0', linestyle='-', linewidth=0.25)
plt.title("Framingham Study (1948 ff.): Survival Condition Educational Status", color = "#4C9900")
plt.xlabel("(Years)")
# Additional Info: license
plt.text(0, 0.55, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Additional Info: Discriminator legend
plt.text(12, 1.00, '1: Elementary / Primary / Middle School', fontsize = 9, color = 'grey', style = 'normal', weight = 'normal')
plt.text(12, 0.99, '2: High School Diploma', fontsize = 9, color = 'grey', style = 'normal', weight = 'normal')
plt.text(12, 0.98, '3: Undergraduate / Vocational School', fontsize = 9, color = 'grey', style = 'normal', weight = 'normal')
plt.text(12, 0.97, '4: College (BS, BA) degree or more', fontsize = 9, color = 'grey', style = 'normal', weight = 'normal')
# Set layout
plt.tight_layout()
We see a striking pattern, with the lack of higher education conferring a major survival penalty!
Considering confidence intervals, the survival data for all levels of higher education seem neatly superimposed.
It is tempting to combine the three higher education groups post hoc into a single, larger group for clarity and robustness, potentially narrowing confidence intervals.
However, when planning the study, we might very well have been unaware of where to expect the discrimination, if any. Thus the prespecified analysis (see below) would most probably have consisted of comparing all four groups with each other, respectively.
Post hoc analyses (i.e. analyses performed with knowledge of the study data) may only be used for exploratory purposes and hypothesis generation. They should be used neither for formal statistical testing, nor for the deduction of clinical implications. Having said that, we will take the liberty to perform them later on anyway.
By convention, and because it results in a reasonable compromise between alpha and beta errors, we generally test statistical hypotheses against a significance level of alpha = 0.05 (P < 0.05).
We will cover the question of which test statistics might be appropriate in a separate paragraph. Let's first have a look at whom to test against whom:
Testing for differences among the four educational groups could be performed by six separate groupwise tests
(i.e. 1-2, 1-3, 1-4, 2-3, 2-4, 3-4)
Since multiple testing should be controlled for statistically, in order to limit the risk of spurious false positive findings, we shall not perform all six tests against an alpha of 0.05.
There are numerous options for controlling multiplicity. However, within the scope of this humble notebook, we will only discuss the time-honoured, albeit very conservative, Bonferroni method. It is rather straightforward:
When the studywise error shall be alpha = 0.05, for k comparisons, let alpha(k) = alpha / k.
In plaintext: For six comparisons each, we shall test against P < 0.0083 (0.05 / 6).
Proceeding thus, we retain the prespecified alpha for the study as a whole. However, we sharpen the alpha for the individual groupwise comparisons, reducing the overall power of our study to statistically confirm a relevant real world difference between groups.
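The Bonferroni arithmetic from above can be sketched in a few lines of Python (the group labels simply stand for the four educational strata):

```python
from itertools import combinations

# The four educational strata to be compared pairwise
groups = [1, 2, 3, 4]
pairs = list(combinations(groups, 2))  # (1,2), (1,3), (1,4), (2,3), (2,4), (3,4)

# Bonferroni: for k comparisons, test each against alpha / k
alpha = 0.05
alpha_per_test = alpha / len(pairs)

print(len(pairs))                # 6
print(round(alpha_per_test, 4))  # 0.0083
```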
As an alternative, several omnibus extensions for standard tests have been proposed, that may handle multiple group survival comparisons. However, these may sometimes be difficult to interpret, and may not be available in basic statistics packages. Anyway, they are beyond the scope of this document.
Following the above exposition on what we should do, we will do something different, of course ;-)
We will proceed in a post hoc fashion, by combining groups 2-4 into a single group of higher education. This will give us a dichotomous covariate "higher education yes/no".
The intent for this is not post hoc beautification of the results. The intent is to simplify the data structure, in order to reduce the complexity and bulk of statistical testing for the sake of didactic clarity. Furthermore, important general principles of data cleansing and statistical analysis will be easier to understand from the consolidated dataset.
Before we delve into data cleansing and statistical testing, however, we will display our consolidated data in one more graph.
First, we will prepare our model and the dichotomous discriminator:
# Discriminator
kmf_lo = KaplanMeierFitter()
kmf_hi = KaplanMeierFitter()
lo_edu = (survival_data["educ"] == 1)
hi_edu = (survival_data["educ"] > 1)
When showing survival curves, one should always include so-called 'at-risk tables', i.e. time-series tabulations of the number of events, and the number of participants remaining at risk, respectively. This allows the reader to draw conclusions, amongst other things, on the robustness and reliability of the data on display.
Let's import the method needed for this:
# Import at risk counts for survival graphs
from lifelines.plotting import add_at_risk_counts
Now we can proceed to generate a slick graph:
# Define figure
plt.figure(figsize=(9,8))
# Define survival plot
ax = plt.subplot(111)
# Set percent formatter in order to display percent instead of float
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
# Model data
kmf_lo.fit(T[lo_edu], event_observed=E[lo_edu], label="Elementary/Primary/Middle School")
kmf_lo.plot_survival_function(ax=ax)
kmf_hi.fit(T[hi_edu], event_observed=E[hi_edu], label="High School/Vocational/College (BS, BA or higher)")
kmf_hi.plot_survival_function(ax=ax)
# Add ticks, grid, title and labels
plt.tick_params(labeltop=False, labelright=True, right=True)
plt.xticks(np.arange(0, 25, step=1), fontsize = 10) # Set label locations.
plt.grid(visible=True, which='both', axis='both', color='#C0C0C0', linestyle='-', linewidth=0.25)
#plt.minorticks_on()
plt.title("Framingham Study (1948 ff.): Survival Condition Higher Education", color = "#4C9900")
plt.xlabel("(Years)")
# Additional Info: license
plt.text(0, 0.55, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# At risk table
add_at_risk_counts(kmf_lo, kmf_hi, ax=ax, fontsize = 6, color = 'grey')
# Set layout
plt.tight_layout()
In a typical long-term study, subjects are right-censored for several reasons:
- Subjects withdrawing consent (withdrawing from the study)
- Subjects lost to follow-up
- Subjects without an event up to the end of the study (all remaining subjects are right-censored in the end)
It is quite unusual for a long-term study comprising several thousand subjects not to experience any censoring along the way. We may speculate that people in the 1950s did not dare to withdraw from an important official scientific study. The total lack of loss to follow-up may furthermore be explained by the constricted geography (all subjects were recruited from a single smallish town in Middlesex County, Massachusetts). The NIH data documentation explicitly states that "each[!] participant was followed for a total of 24 years", which is a remarkable feat.
When large study populations encounter low event rates, two hypothetical survival curves may, for example, descend from 100 % to 98 % and 96 %, respectively. Plotting these in a graph on a y-axis scale of 95 - 100 %, the difference in survival, and the temporal behaviour is discerned easily.
However, the difference will appear heavily exaggerated to the human observer. Thus, for studies with low attrition, reputable reporting should supply a supplemental second plot showing the same survival curves on a full scale of 0-100 %.
Since our survival curves drop to less than 60 % and roughly 70 %, respectively, this may not be such a big issue. For the sake of respectability, however, we will include both scales, side by side:
# Define figure
fig, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(9,5))
fig.suptitle("Survival Curves to Scale")
# Subplot 1:
# Set percent formatter in order to display percent instead of float
ax1.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
ax1.tick_params(labeltop=False, labelright=True, right=True)
# Plot survival curves without confidence intervals (for clarity)
kmf_lo.plot_survival_function(ax=ax1, ci_show=False, label='lower')
kmf_hi.plot_survival_function(ax=ax1, ci_show=False, label='higher')
# Additional info
ax1.set_title("populated scale", color = "#4C9900")
ax1.set(xlabel='(Years)')
ax1.grid()
# Subplot 2:
# Set percent formatter in order to display percent instead of float
ax2.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
ax2.tick_params(labeltop=False, labelright=True, right=True)
# Plot survival curves without confidence intervals (for clarity)
kmf_lo.plot_survival_function(ax=ax2, ci_show=False, label='lower')
kmf_hi.plot_survival_function(ax=ax2, ci_show=False, label='higher')
# Set full scale for subplot 2; adjust top to graphically match subplot 1
ax2.set_ylim(bottom=0, top=1.048)
# Additional info
ax2.set_title("full scale", color = "#4C9900")
ax2.set(xlabel='(Years)')
ax2.grid()
# Additional Infos
plt.text(10,0.05, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Set layout
plt.tight_layout()
We will prepare an excerpt of our full dataset for further analysis. This subset will be restricted to three variables: TIMEDTH, DEATH, and the derived higher.
The dichotomous higher variable is derived by reclassification of the original educ variable like this:
{ 1 } → 0
{ 2, 3, 4 } → 1
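A minimal sketch of this reclassification with pandas (using a toy Series in place of the real educ column; note that Series.map leaves values absent from the mapping, including NaN, as NaN):

```python
import pandas as pd
import numpy as np

# Toy stand-in for the original 'educ' column (strata 1-4 plus a missing entry)
educ = pd.Series([1, 2, 3, 4, np.nan])

# Reclassify: {1} -> 0, {2, 3, 4} -> 1; NaN entries stay NaN
higher = educ.map({1: 0, 2: 1, 3: 1, 4: 1})
print(higher.tolist())
```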
Let's import the derived dataset and have a look:
# Import CSV dataset excerpt
survival_dicho = pd.read_csv("jup_surv_educ_dicho.csv", sep=';')
print(survival_dicho.dtypes)
print(survival_dicho)
TIMEDTH      int64
DEATH        int64
higher     float64
dtype: object
      TIMEDTH  DEATH  higher
0        8766      0     1.0
1        8766      0     1.0
2        8766      0     0.0
3        2956      1     1.0
4        8766      0     1.0
...       ...    ...     ...
4429     6433      1     1.0
4430     6729      1     0.0
4431     8766      0     1.0
4432     8766      0     1.0
4433     8766      0     1.0

[4434 rows x 3 columns]
At first glance, this looks fine to proceed. However...
Before performing formal analyses, we should have a closer look at the raw data. Amongst other things, we should:
Check for consistency of data types and for "rubbish" characters in the table entries of all variables.
Clarify whether there are any NaN entries for our variables, and decide what to do about them.
NaN is short for 'Not a Number' and refers, most often, to empty cells in a dataset. Empty cells are likely present due to missing values. Since follow-up was extraordinarily comprehensive in the Framingham Study, there should be no missing values for clinical endpoints. However, not all information is present on all data items for all subjects.
Most statistical methods will not handle NaN natively, and will throw an error, if we feed them a dataframe containing NaN.
This is quite appropriate, as NaN should not be handled automatically. Important questions have to be clarified by the data scientist:
There are basically two distinct strategies for dealing with NaN:
Exclude subjects with incomplete data from the analysis. This strategy also is called 'complete case analysis'. If, however, we exclude subjects with any missing data in any variable, we might end up with a critically diminished dataset. We will lose statistical power, and we may even lose overall validity.
Impute missing values. Imputation is employed to "fill up" the missing values. There are intricate problems, and even more intricate solutions, to imputation. One of the main problems is that, in order to retain the overall properties of the dataset, it is not sufficient to replace NaNs with plausible content. It is mandatory to also retain distributional properties like variance. Simple imputation will introduce various amounts of bias into the dataset, and thus into the analysis and its conclusions.
Simple imputation strategies (inadequate most of the time), e.g. substituting missing values with the column mean, median or mode.
Complex imputation strategies (computationally intensive), e.g. regression-based or multiple imputation.
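To make the two basic strategies concrete, here is a toy sketch (illustrative made-up values, not the Framingham data; the median fill stands in for the simple strategies, which, as noted, would often be inadequate in a real analysis):

```python
import pandas as pd
import numpy as np

# Toy dataframe with one missing covariate value (made-up numbers)
df = pd.DataFrame({"time": [10, 20, 30],
                   "event": [1, 0, 1],
                   "covariate": [1.0, np.nan, 0.0]})

# Strategy 1: complete case analysis -- drop every row containing a missing value
complete = df.dropna()
print(len(complete))  # 2 rows remain

# Strategy 2: naive single imputation -- fill the gap with the column median
imputed = df.fillna({"covariate": df["covariate"].median()})
print(imputed["covariate"].isna().sum())  # 0 missing values left
```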
There is an implementation of multiple imputation for Python, provided in the statsmodels module MICE (Multiple Imputation with Chained Equations). Hopefully, we will not need it for our dataset; otherwise things would become a bit arduous.
Due to complete follow-up and lack of drop-outs, we should have complete data on our event variables DEATH and TIMEDTH.
However, there are empty cells in the original educ variable, and thus, by necessity, in our derived higher variable as can be seen in this cutout from the data table:
Python's pandas library supplies two ways to check for NaN, either by counting data cells with content, or by boolean checking of each cell being NaN True or False (which can then be summed up).
survival_dicho.isna()
|      | TIMEDTH | DEATH | higher |
|------|---------|-------|--------|
| 0    | False   | False | False  |
| 1    | False   | False | False  |
| 2    | False   | False | False  |
| 3    | False   | False | False  |
| 4    | False   | False | False  |
| ...  | ...     | ...   | ...    |
| 4429 | False   | False | False  |
| 4430 | False   | False | False  |
| 4431 | False   | False | False  |
| 4432 | False   | False | False  |
| 4433 | False   | False | False  |

4434 rows × 3 columns
Unfortunately for demonstration purposes, but fortunately for our analysis, NaNs do not seem too prevalent, at least at first glance.
Let's count them:
survival_dicho.isna().sum()
TIMEDTH      0
DEATH        0
higher     113
dtype: int64
Let's cross-check, including any valid datatype:
survival_dicho.count(axis=0, numeric_only=False)
TIMEDTH    4434
DEATH      4434
higher     4321
dtype: int64
Yup! There are 113 empty cells, i.e. missing values for our dichotomous covariate of higher education.
This amounts to 2.5 % of our study subjects.
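For the record, the 2.5 % is just the missing count over the cohort size:

```python
# Proportion of subjects with missing educational status (counts from the output above)
n_missing = 113
n_total = 4434
print(round(100 * n_missing / n_total, 1))  # 2.5 (percent)
```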
So, let's have a look, how mortality is distributed among the participants with NaN in educational status:
print(survival_dicho[survival_dicho.isna().any(axis=1)])
print(survival_dicho[survival_dicho.isna().any(axis=1)].value_counts(subset=['DEATH'], normalize=False))
print(survival_dicho[survival_dicho.isna().any(axis=1)].value_counts(subset=['DEATH'], normalize=True))
      TIMEDTH  DEATH  higher
34       6948      1     NaN
38       8766      0     NaN
74       8766      0     NaN
192      3888      1     NaN
213      8766      0     NaN
...       ...    ...     ...
4269     8766      0     NaN
4285     8766      0     NaN
4310     8766      0     NaN
4311     5581      1     NaN
4327     4788      1     NaN

[113 rows x 3 columns]
DEATH
0    72
1    41
dtype: int64
DEATH
0    0.637168
1    0.362832
dtype: float64
The 113 subjects for whom we have no information with regard to educational status exhibit a survival rate of 64 %, which, going back to our Kaplan-Meier graph, lies somewhere in between that of subjects with, and that of subjects without, higher education.
Fortunately, our outcome data are complete. Missing data in covariates do not cause bias in complete case analysis, if the reasons for the missingness are unrelated to the outcome. Since mortality over time for our 2.5 % of missing-data cases is roughly equivalent to overall mortality in our cohort, it is rather unlikely that the missing data will have a major impact on the outcome. However, we have to make sure.
In order to assess, whether we will have to go down the thorny route of multiple imputation, or can get by with complete case analysis, we will analyze how sensitive our results would be in several hypothetical ultimate limit states for our missing data. We will consecutively:
- Fill all 113 empty cells with "0" (higher education: no)
- Fill all 113 empty cells with "1" (higher education: yes)
- Fill NaN for the 41 fatalities with "1", and for the 72 survivors with "0"
- Fill NaN for the 41 fatalities with "0", and for the 72 survivors with "1"
- Delete all 113 subjects with empty cells (complete cases dataset)
For our limit analysis, we will model survival on all four limit datasets, in order to ascertain that, even in the presence of maximally nonrandom missingness, our conclusions would not be influenced substantially by the missing values in our dataset.
If our limit analysis implies that we can safely drop subjects with missing values, we will proceed with the complete cases dataset for clinical analysis and will skip multiple imputation procedures.
Let's prepare the five datasets:
#(0) Original dataset
# Display counts of non-NaN values, again (we have already done this in code section [19])
print(survival_dicho.count(axis=0, numeric_only=False))
# Check counts of DEATH vs higher in our original dataset
print(survival_dicho.value_counts(subset=['DEATH','higher'], normalize=False))
TIMEDTH    4434
DEATH      4434
higher     4321
dtype: int64

DEATH  higher
0      1.0       1770
       0.0       1042
1      0.0        780
       1.0        729
dtype: int64
#(1) Lower limit case for higher education
survival_limit_0 = survival_dicho.fillna(value=0)
# Check counts of non-NaN
print(survival_limit_0.count(axis=0))
# Check counts of DEATH vs higher in modified dataset
print(survival_limit_0.value_counts(subset=['DEATH','higher'], normalize=False))
TIMEDTH    4434
DEATH      4434
higher     4434
dtype: int64

DEATH  higher
0      1.0       1770
       0.0       1114
1      0.0        821
       1.0        729
dtype: int64
#(2) Upper limit case for higher education
survival_limit_1 = survival_dicho.fillna(value=1)
# Check counts of non-NaN
print(survival_limit_1.count(axis=0))
# Check counts of DEATH vs higher in modified dataset
print(survival_limit_1.value_counts(subset=['DEATH','higher'], normalize=False))
TIMEDTH    4434
DEATH      4434
higher     4434
dtype: int64

DEATH  higher
0      1.0       1842
       0.0       1042
1      0.0        780
       1.0        770
dtype: int64
#(3) Fill NaN for the 41 fatalities with "1", and for the 72 survivors with "0"
# Make copy of DataFrame
survival_death_1=survival_dicho.copy(deep=True)
# Fill NaNs according to condition
survival_death_1.loc[survival_death_1.DEATH.eq(1) & survival_death_1.higher.isna(), 'higher'] = 1
survival_death_1.loc[survival_death_1.DEATH.eq(0) & survival_death_1.higher.isna(), 'higher'] = 0
# Check counts of non-NaN
print(survival_death_1.count(axis=0))
# Check counts of DEATH vs higher in modified dataset
print(survival_death_1.value_counts(subset=['DEATH','higher'], normalize=False))
TIMEDTH    4434
DEATH      4434
higher     4434
dtype: int64

DEATH  higher
0      1.0       1770
       0.0       1114
1      0.0        780
       1.0        770
dtype: int64
#(4) Fill NaN for the 41 fatalities with "0", and for the 72 survivors with "1"
# Make copy of DataFrame
survival_death_0=survival_dicho.copy(deep=True)
# Fill NaNs according to condition
survival_death_0.loc[survival_death_0.DEATH.eq(1) & survival_death_0.higher.isna(), 'higher'] = 0
survival_death_0.loc[survival_death_0.DEATH.eq(0) & survival_death_0.higher.isna(), 'higher'] = 1
# Check counts of non-NaN
print(survival_death_0.count(axis=0))
# Check counts of DEATH vs higher in modified dataset
print(survival_death_0.value_counts(subset=['DEATH','higher'], normalize=False))
TIMEDTH    4434
DEATH      4434
higher     4434
dtype: int64

DEATH  higher
0      1.0       1842
       0.0       1042
1      0.0        821
       1.0        729
dtype: int64
#(5) Complete cases
survival_complete = survival_dicho[survival_dicho['higher'].notna()]
print(survival_complete.count(axis=0))
# Check counts of DEATH vs higher in modified dataset
print(survival_complete.value_counts(subset=['DEATH','higher'], normalize=False))
TIMEDTH    4321
DEATH      4321
higher     4321
dtype: int64

DEATH  higher
0      1.0       1770
       0.0       1042
1      0.0        780
       1.0        729
dtype: int64
OK, now that we have got our five datasets, it is time to model them. We will use the Cox proportional hazards model; the choice of model will be discussed further down. Suffice it to say for now that the CPH model is frequently employed in clinical science to evaluate event-free survival and other time-to-event series.
We will consecutively run the CPH model on the four limit datasets.
You don't need to read through all the details of the extensive output!
I will give a brief summary of the results with regard to our limit analysis right after the first four model runs are finished. You may scroll down to the green box...
from lifelines import CoxPHFitter
cph = CoxPHFitter()
cph.fit(survival_limit_1, 'TIMEDTH', 'DEATH')
cph.print_summary(model="untransformed variables", decimals=3)
model | lifelines.CoxPHFitter |
---|---|
duration col | 'TIMEDTH' |
event col | 'DEATH' |
baseline estimation | breslow |
number of observations | 4434 |
number of events observed | 1550 |
partial log-likelihood | -12663.659 |
time fit was run | 2023-05-05 18:39:59 UTC |
model | untransformed variables |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|---|---|---|
higher | -0.470 | 0.625 | 0.051 | -0.570 | -0.371 | 0.566 | 0.690 | 0.000 | -9.254 | <0.0005 | 65.326 |
Concordance | 0.558 |
---|---|
Partial AIC | 25329.317 |
log-likelihood ratio test | 84.946 on 1 df |
-log2(p) of ll-ratio test | 64.822 |
cph.fit(survival_limit_0, 'TIMEDTH', 'DEATH')
cph.print_summary(model="untransformed variables", decimals=3)
model | lifelines.CoxPHFitter |
---|---|
duration col | 'TIMEDTH' |
event col | 'DEATH' |
baseline estimation | breslow |
number of observations | 4434 |
number of events observed | 1550 |
partial log-likelihood | -12662.994 |
time fit was run | 2023-05-05 18:39:59 UTC |
model | untransformed variables |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|---|---|---|
higher | -0.473 | 0.623 | 0.051 | -0.573 | -0.373 | 0.564 | 0.689 | 0.000 | -9.288 | <0.0005 | 65.786 |
Concordance | 0.559 |
---|---|
Partial AIC | 25327.989 |
log-likelihood ratio test | 86.274 on 1 df |
-log2(p) of ll-ratio test | 65.791 |
cph.fit(survival_death_1, 'TIMEDTH', 'DEATH')
cph.print_summary(model="untransformed variables", decimals=3)
model | lifelines.CoxPHFitter |
---|---|
duration col | 'TIMEDTH' |
event col | 'DEATH' |
baseline estimation | breslow |
number of observations | 4434 |
number of events observed | 1550 |
partial log-likelihood | -12677.330 |
time fit was run | 2023-05-05 18:39:59 UTC |
model | untransformed variables |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|---|---|---|
higher | -0.387 | 0.679 | 0.051 | -0.486 | -0.287 | 0.615 | 0.750 | 0.000 | -7.610 | <0.0005 | 45.053 |
Concordance | 0.548 |
---|---|
Partial AIC | 25356.659 |
log-likelihood ratio test | 57.604 on 1 df |
-log2(p) of ll-ratio test | 44.826 |
cph.fit(survival_death_0, 'TIMEDTH', 'DEATH')
cph.print_summary(model="untransformed variables", decimals=3)
model | lifelines.CoxPHFitter |
---|---|
duration col | 'TIMEDTH' |
event col | 'DEATH' |
baseline estimation | breslow |
number of observations | 4434 |
number of events observed | 1550 |
partial log-likelihood | -12646.577 |
time fit was run | 2023-05-05 18:39:59 UTC |
model | untransformed variables |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|---|---|---|
higher | -0.556 | 0.573 | 0.051 | -0.656 | -0.456 | 0.519 | 0.634 | 0.000 | -10.923 | <0.0005 | 89.860 |
Concordance | 0.569 |
---|---|
Partial AIC | 25295.155 |
log-likelihood ratio test | 119.108 on 1 df |
-log2(p) of ll-ratio test | 89.704 |
Datasets with uniform fill of all NaNs | HR | 95%-CI |
---|---|---|
1: Upper limit (all missing values set to '1'): | 0.625 | 0.566-0.690 |
2: Lower limit (all missing values set to '0'): | 0.623 | 0.564-0.689 |
For clinical purposes, the hazard ratios derived from limit datasets 1 & 2 are very similar, and the confidence intervals remain within a narrow band. It seems reasonable to conclude that the missing values, in this scenario, will not impact our further analysis to any relevant extent.
Datasets with outcome-dependent fill of NaNs | HR | 95%-CI |
---|---|---|
3: Fatalities with NaN set to '1'†: | 0.679 | 0.615-0.750 |
4: Fatalities with NaN set to '0'††: | 0.573 | 0.519-0.634 |
†: Conversely, survivors set to '0'; ††: conversely, survivors set to '1'.
With outcome-dependent fill, the hazard ratios derived from limit datasets 3 & 4 are more divergent, as expected. Even under these extreme nonrandom missingness conditions, however, the confidence intervals allow us to conclude that, for the sake of clinical decision-making, our results will hold irrespective of the missing data.
In summary, our sensitivity analysis suggests:
Discarding 2.5 % of our study population, in this specific context, seems a reasonable compromise between reliability, power and practicability.
We will now run the CPH model on our complete cases dataset for clinical analysis.
As before, you don't need to read through all the details of the extensive output!
I will give a brief summary of the results with regard to our clinical analysis right after the model run is finished. You may scroll down to the second green box...
cph.fit(survival_complete, 'TIMEDTH', 'DEATH')
cph.print_summary(model="untransformed variables", decimals=3)
model | lifelines.CoxPHFitter |
---|---|
duration col | 'TIMEDTH' |
event col | 'DEATH' |
baseline estimation | breslow |
number of observations | 4321 |
number of events observed | 1509 |
partial log-likelihood | -12287.537 |
time fit was run | 2023-05-05 18:39:59 UTC |
model | untransformed variables |
coef | exp(coef) | se(coef) | coef lower 95% | coef upper 95% | exp(coef) lower 95% | exp(coef) upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|---|---|---|
higher | -0.484 | 0.616 | 0.052 | -0.585 | -0.383 | 0.557 | 0.682 | 0.000 | -9.389 | <0.0005 | 67.164 |
Concordance | 0.560 |
---|---|
Partial AIC | 24577.074 |
log-likelihood ratio test | 87.784 on 1 df |
-log2(p) of ll-ratio test | 66.892 |
Perspective: | HR | 95%-CI |
---|---|---|
Higher Education (hazard benefit): | 0.616 | 0.557-0.682 |
Lower Education (hazard penalty): | 1.623 | 1.466-1.795 |
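The 'hazard penalty' row is simply the reciprocal of the fitted hazard ratio and its confidence bounds (note that the bounds swap upon inversion). A minimal sketch, using the values from the CPH summary above:

```python
# Invert a hazard ratio and its 95% CI to switch perspective
# (values taken from the CPH summary above)
hr = 0.616                 # higher education vs. lower (hazard benefit)
ci = (0.557, 0.682)

hr_inv = 1 / hr                    # lower education vs. higher (hazard penalty)
ci_inv = (1 / ci[1], 1 / ci[0])    # bounds swap when taking reciprocals

print(round(hr_inv, 3))                       # 1.623
print(tuple(round(b, 3) for b in ci_inv))     # (1.466, 1.795)
```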
This numeric result underscores our clinical intuition gained from looking at the Kaplan-Meier graph.
Of course, in real life, we would have checked formal assumptions before reporting on the modelling. However, I wanted to get the discussion of missing values and imputation out of the way, first.
Survival models, in general, relate the time that passes, before some event occurs, to one or more covariates that may be associated with that quantity of time.
Two survival curves, related to a covariate, may exhibit different relative behaviour over time as sketched:
(Illustration from STAT331 course, Stanford University)
The 'Kaplan-Meier' analysis is non-parametric: it makes no assumptions about the distribution of the outcome variable, nor about the functional form of the covariate(s). It is useful for visualizing the data on an as-is basis.
There is a plethora of parametric models like 'exponential', 'log-normal', 'log-logistic', 'gamma', 'Weibull', etc. These models make specific assumptions about the distribution of the data, as well as the functional form of the covariate(s). Although not particularly helpful for our ongoing analysis, there is an example of parametric modelling of our dataset in Mini Appendix (E), just to demonstrate the principles of usage in lifelines.
The Cox proportional hazards model is a so-called semi-parametric model: it makes no assumptions about the distribution of the outcome data, but it does assume a specific relationship between the covariate(s) and the outcome. It is rather robust under various circumstances. Survival times generally do not follow a normal distribution; moreover, with censored data, inspecting distributional assumptions can be difficult.
The major formal assumption that the Cox proportional hazards model relies on is, oh well ('ta-dah'), that of proportional hazards.
In a proportional hazards model, the unique effect of a unit increase in a covariate is multiplicative with respect to the hazard rate.
In plain language: if the presence of a given risk factor doubles the risk for stroke, it is assumed to do so in a time-independent manner
(i.e. the event hazard for a bearer of the risk factor will be double that of a non-bearer along the whole duration of follow-up).
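In standard notation (a single binary covariate $x$, coefficient $\beta$, baseline hazard $h_0$), the proportional hazards assumption reads:

```latex
h(t \mid x) = h_0(t)\, e^{\beta x}
\qquad\Longrightarrow\qquad
\frac{h(t \mid x=1)}{h(t \mid x=0)} = e^{\beta}
```

The baseline hazard $h_0(t)$ cancels in the ratio, so the hazard ratio $e^{\beta}$ is constant over the whole follow-up, however $h_0(t)$ itself may vary with time.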
A dataset that obviously violates the proportional hazards assumption would, for example, consist of survival data with and without a medical intervention that poses a major increased short-term hazard while conferring major long-term benefits, resulting in non-proportional hazards along the way.
Datasets with non-proportional hazards may be amenable to analysis by a Cox model with time-varying covariates, or by accelerated failure time models, depending on data properties. All of this is beyond the scope of this non-technical notebook.
We will use the most convenient option built into lifelines, to test for violation of the proportional hazards assumption:
cph.check_assumptions(survival_complete, p_value_threshold=0.05, show_plots=True)
Proportional hazard assumption looks okay.
[]
Well, that seems like a clear statement. However, it is not overly verbose with regard to substantiation ;-)
For the time being, we will be happy with that. Please refer to Mini Appendix (C) at the end of this notebook for a more elaborate discussion of testing the proportional hazards assumption in lifelines, and in general. There, we will supply numeric test results as well as graphic visualizations.
Our dataset does not violate the proportional hazards assumption. Our choice of the Cox Proportional Hazards model is vindicated.
Although the influence of higher education on mortality seems pretty robust, we shall finalize our analysis with a formal test.
The log-rank test is a powerful hypothesis test to compare the survival distributions of two samples against the null hypothesis of no difference. It is a nonparametric test, and appropriate to use when the data are skewed and censored. It is widely used in clinical trials.
The log-rank test is dependent on the same assumptions as the Kaplan-Meier survival model - namely, that:
- censoring is unrelated to prognosis;
- survival probabilities are the same for subjects recruited early and late in the study;
- the events happened at the times specified.
The log-rank test does not depend on the proportional hazards assumption. Overall, it is very versatile. A short discussion on rather rare cases, where the log-rank test may lose power, can be found in Mini Appendix (A).
The log-rank test assesses statistical significance, but does not estimate an effect size. That is the reason for employing both the Cox Proportional Hazards model (to derive hazard ratios and confidence intervals) as well as the log-rank test (to decide on the null hypothesis) in our analysis:
# Import log rank test statistics
from lifelines.statistics import logrank_test
# Split durations and event indicators by educational status
is_lo = survival_complete['higher'].eq(0)
is_hi = survival_complete['higher'].eq(1)
# Perform log-rank test comparing lower vs. higher education
lrt = logrank_test(
    survival_complete.loc[is_lo, 'TIMEDTH'],
    survival_complete.loc[is_hi, 'TIMEDTH'],
    event_observed_A=survival_complete.loc[is_lo, 'DEATH'],
    event_observed_B=survival_complete.loc[is_hi, 'DEATH'],
)
lrt.print_summary()
t_0 | -1 |
---|---|
null_distribution | chi squared |
degrees_of_freedom | 1 |
test_name | logrank_test |
test_statistic | p | -log2(p) | |
---|---|---|---|
0 | 9250.53 | <0.005 | inf |
Our test result (P < 0.005) strongly suggests rejecting the null hypothesis of there being no survival difference between the two groups.
It looks like, overall, we can be rather confident in our conclusion that a lack of higher education is a statistically significant and clinically relevant risk factor for all-cause mortality in the historic Framingham Study population.
Sadly, this remains true until this day...
Please bear in mind, however, that our whole exercise was not aimed at extracting meaning from the historic Framingham study, which merely served to keep things more interesting than an abstract dummy dataset would have been. The aim of this notebook was to demonstrate a use case for basic survival statistics, employing only open source tools along the way.
We have taken the opportunity to demonstrate the three classic tools of basic survival analysis:
1. Kaplan Meier Estimator: survival curves
2. Cox Proportional Hazards: hazard rates & confidence intervals
3. Log-Rank test: statistical significance
That's it for the main text of this notebook. You may take a peek at the mini appendices below.
If you have followed this rather verbose walkthrough till the end, I may congratulate you on your stamina ;-)
I would like to express my gratitude to all the wonderful people who contribute to these great projects, without which this humble document would never have come to be:
Long live open source!
As stated above, the log-rank test does not depend on the proportional hazards assumption. However, it may lose power, and fail, in situations where hazard plots (not necessarily survival plots[!]) cross. Within this notebook, we will not provide a technical discussion; however, we may take a peek at the Nelson-Aalen estimate of the hazards in our data.
# Import methods
from lifelines import NelsonAalenFitter
# Define figure
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=True, figsize=(10,5))
fig.suptitle("Nelson-Aalen Hazard Plot (lower/higher education)")
# Perform modeling
naf_lo = NelsonAalenFitter(nelson_aalen_smoothing=False)
naf_hi = NelsonAalenFitter(nelson_aalen_smoothing=False)
naf_lo.fit(T[lo_edu], event_observed=E[lo_edu], label="lower")
naf_hi.fit(T[hi_edu], event_observed=E[hi_edu], label="higher")
# Plot
naf_lo.plot_hazard(ax=ax1, bandwidth=0.1)
naf_hi.plot_hazard(ax=ax1, bandwidth=0.1)
ax1.set_title("minimal smoothing", color = "#4C9900")
ax1.set(xlabel='(Years)')
naf_lo.plot_hazard(ax=ax2, bandwidth=0.5)
naf_hi.plot_hazard(ax=ax2, bandwidth=0.5)
ax2.set_title("moderate smoothing", color = "#4C9900")
ax2.set(xlabel='(Years)')
naf_lo.plot_hazard(ax=ax3, bandwidth=3)
naf_hi.plot_hazard(ax=ax3, bandwidth=3)
ax3.set_title("heavy smoothing", color = "#4C9900")
ax3.set(xlabel='(Years)')
# Additional Infos
plt.text(0, 0.13, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Set layout
plt.tight_layout()
Because the hazard is rather noisy, some amount of smoothing makes interpretation easier. Since the hazard sharply drops at study termination (all remaining subjects are right-censored without event), heavy smoothing results in an artificial, peculiar downward slope at the plot's tail.
The Nelson-Aalen hazard lines do not cross; thus the log-rank test may be considered valid. However, the hazards seem to diverge a bit more markedly from roughly 12 years onwards. Going back to the Kaplan-Meier survival curves of our analysis, this effect might have been anticipated on close scrutiny. However, the minor divergence of the curves was not marked enough to fail the proportional hazards assumption test. Thus, to the best of our knowledge, all methods have been employed properly.
This Link will take you back to the main text ('Log-Rank Test' of Framingham data). You might also consider having a look at the sketch in the subsequent Mini Appendix (B) for clarification of terminology:
In statistics, terminology may often be confusing to the uninitiated. The mathematical relationship between survival, hazard and cumulative hazard is detailed in this wonderful image from the lifelines library documentation:
'CDF' denotes Cumulative Distribution Function.
'PDF' denotes Probability Density Function.
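As a quick numeric sanity check of these relationships, consider a toy model with a constant hazard, i.e. an exponential distribution (all numbers here are illustrative, not from the Framingham data):

```python
import numpy as np

# Constant hazard lambda -> exponential survival model
lam = 0.1
t = np.linspace(0.0, 20.0, 50)

H = lam * t                   # cumulative hazard H(t)
S = np.exp(-H)                # survival S(t) = exp(-H(t))
F = 1.0 - S                   # CDF
f = lam * np.exp(-lam * t)    # PDF

# instantaneous hazard = PDF / survival recovers the constant lambda
assert np.allclose(f / S, lam)
# cumulative hazard = -log(survival)
assert np.allclose(-np.log(S), H)
```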
We will use the data from the main text (mortality over time with regard to educational status) to demonstrate the above relation visually:
# We do not have to import the methods, nor do we have to run the
# fitters, because both steps have been done in previous cells.
# Define figure:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, sharey=False, figsize=(10,5))
fig.suptitle("Framingham Study (1948 ff.): Mortality Over Time by Educational Status")
# Plot Nelson-Aalen hazard (again)
naf_lo.plot_hazard(ax=ax1, bandwidth=0.5, ci_show=False)
naf_hi.plot_hazard(ax=ax1, bandwidth=0.5, ci_show=False)
ax1.set_title("Instantaneous Hazard", color = "#4C9900")
ax1.set(xlabel='(Years)')
ax1.text(8.7,0.047, '(smoothed)', fontsize = 9, color = "#4C9900", style = 'normal')
# Plot Nelson-Aalen cumulative hazard
naf_lo.plot_cumulative_hazard(ax=ax2, ci_show=False)
naf_hi.plot_cumulative_hazard(ax=ax2, ci_show=False)
ax2.set_title("Cumulative Hazard", color = "#4C9900")
ax2.set(xlabel='(Years)')
# Plot Kaplan Meier for comparison
kmf_lo.plot_survival_function(ax=ax3, ci_show=False, label='lower')
kmf_hi.plot_survival_function(ax=ax3, ci_show=False, label='higher')
ax3.set(xlabel='(Years)')
ax3.set_title("Survival", color = "#4C9900")
# Additional Infos
plt.text(0,0.57, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Set layout
plt.tight_layout()
This Link will take you back to the main text ('Log-Rank Test' of Framingham data).
In the main body of this notebook, we have pragmatically tested our clinical analysis dataset for violation of the proportional hazards assumption with the inbuilt check_assumptions
method from lifelines, circumventing any technical discussion. In this mini appendix we will expand a little on the topic of testing the proportional hazards assumption, as it is sometimes contested.
A number of tests have been developed for the proportional hazards assumption, e.g. Cox (1972), Gill & Schumacher (1987), Harrell (1986), Lin (1991), Moreau, O'Quigley & Mesbah (1985), Nagelkerke, Oosting & Hart (1984), O'Quigley & Pessione (1989), Schoenfeld (1980) and Wei (1984). The often-cited standard works on this topic are by Grambsch and Therneau (1994) and Therneau and Grambsch (2000). I must admit, frankly, that a lot of the details are beyond my humble understanding of statistics as a clinician.
Amongst the statistics community, there seem to be several distinct schools of thought with regard to the necessity of testing the proportional hazards (PH) assumption:
Some calling for rigorous formal testing of the PH assumption (advocating one or several of the plethora of tests);
Others altogether denying the benefit of formal testing, since, in their opinion,
a) all proposed tests have inadequate discriminatory power,
b) different PH tests often lead to conflicting results,
c) severe violations of the PH assumption should be inferable from data visualization (e.g. crossing of survival curves etc.),
d) violations that are flagged only by PH-tests (not deducible from visualization) might not be relevant;
Some, with whom we tend to agree, try to cover the middle ground by advocating both formal testing and plausibility checks through descriptive analysis.
According to Park & Hendry (2015), statistical tests of the proportional hazards assumption fall into three general classes:
(A) tests focusing on piecewise estimation of models for subsets of data defined by stratification of time;
(B) tests focusing on interactions between covariates and some function of time; and
(C) tests based on examinations of regression residuals.
The lifelines library provides an inbuilt statistical test of the CPH assumption that tests for any time-varying coefficients. A time-varying coefficient implies that a covariate's influence relative to the baseline changes over time, which violates the proportional hazards assumption. The null hypothesis is that the coefficient is not time-varying.
The library provides two options to access the test:
(a) check_assumptions
can be called from within the cph model [as we have done in the main body of our notebook].
(b) proportional_hazard_test
can be run separately.
For our data, check_assumptions
output is rather terse, and currently omits the visualization (see again 'NOTE' further below). That's why I have included the more explicit proportional_hazard_test
as well. In order to rule out a dominant influence of outliers, the test is run separately with four transformations of time:
- Identity (no transformation)
- Rank
- Logarithmic
- Kaplan-Meier.
First, let us replay (a) check_assumptions
, so that we don't have to scroll up and down between main text and appendix when comparing the output of both options:
cph.check_assumptions(survival_complete, p_value_threshold=0.05, show_plots=True)
Proportional hazard assumption looks okay.
[]
NOTE: check_assumptions
should, in addition to the above summary test result, also plot scaled Schoenfeld residuals and bootstrapped lowess lines for visual confirmation, if so specified. Unfortunately, the tool currently omits the visualization whenever the proportional hazards assumption is not violated. I have opened an 'issue' on the lifelines GitHub repository; the issue is still open. We will thus prepare a homemade plot of our own further below.
UPDATE: On 2023-05-01, the main developer of the lifelines project kindly accepted my minor code revision. Thus, from the next release onwards,
check_assumptions
will supply Schoenfeld residual plots whenever the user specifies show_plots=True
, regardless of the numeric test result. Those with a technical interest will find the modified check_assumptions
function code in a postscriptum at the end of this appendix.
Now, however, we will run the explicit (b) proportional_hazard_test
for the four transformations of time, in order to attain a somewhat more verbose result:
from lifelines.statistics import proportional_hazard_test
for transform_spec in ['identity', 'log', 'rank', 'km']:
    results = proportional_hazard_test(cph, survival_complete, time_transform=transform_spec)
    results.print_summary(decimals=3, model="untransformed variables")
time_transform | identity |
---|---|
null_distribution | chi squared |
degrees_of_freedom | 1 |
model | <lifelines.CoxPHFitter: fitted with 4321 total... |
test_name | proportional_hazard_test |
test_statistic | p | -log2(p) | |
---|---|---|---|
higher | 0.00 | 0.96 | 0.06 |
time_transform | log |
---|---|
null_distribution | chi squared |
degrees_of_freedom | 1 |
model | <lifelines.CoxPHFitter: fitted with 4321 total... |
test_name | proportional_hazard_test |
test_statistic | p | -log2(p) | |
---|---|---|---|
higher | 0.00 | 0.97 | 0.05 |
time_transform | rank |
---|---|
null_distribution | chi squared |
degrees_of_freedom | 1 |
model | <lifelines.CoxPHFitter: fitted with 4321 total... |
test_name | proportional_hazard_test |
test_statistic | p | -log2(p) | |
---|---|---|---|
higher | 0.00 | 0.95 | 0.07 |
time_transform | km |
---|---|
null_distribution | chi squared |
degrees_of_freedom | 1 |
model | <lifelines.CoxPHFitter: fitted with 4321 total... |
test_name | proportional_hazard_test |
test_statistic | p | -log2(p) | |
---|---|---|---|
higher | 0.00 | 0.95 | 0.07 |
As a clinician with decades of work in a field heavily dependent on medical imaging, I would feel more at ease having a visual confirmation of the test results. This is of major importance when, for example, the proportional hazards test fails because of outliers, which cannot be deduced from the numeric test result on its own.
Since the inbuilt test in lifelines, at the time of writing, did not provide a visualization of the scaled Schoenfeld residuals when the PH assumption is met, we had to construct a graph of our own.
The computation of Schoenfeld residuals in lifelines presupposes that the CPH model has already been fitted to the dataset at hand. Since the last run of the CPH model in the 'clinical analysis' section of this notebook was performed on the same 'survival_complete' dataset, it would not be necessary to rerun the model here. We will do so anyway, to make sure that this code cell executes properly even if we move its position within the notebook in the future.
First, we will compute scaled Schoenfeld residuals:
# Rerun CoxPHfitter
cph.fit(survival_complete, 'TIMEDTH', 'DEATH')
# Prepare scaled Schoenfeld residuals
scaled_schoenfeld_residuals = cph.compute_residuals(survival_complete, kind='scaled_schoenfeld')
print("\033[1mScaled Schoenfeld residuals computation result:\033[0m")
print(scaled_schoenfeld_residuals)
Scaled Schoenfeld residuals computation result:
covariate higher
1284 -1.836503
2394 -1.837050
2042 -1.837596
1876 -1.838143
2070 -1.838691
... ...
1431 -2.047287
2176 1.961665
3546 -2.047581
1956 -2.048540
1858 -2.049500
[1509 rows x 1 columns]
Since we have used an inbuilt method, we will have to make do with the result, which is somewhat unsatisfactory for our use case: the ordering of the index seems peculiar, and the returned dataset does not contain any time information, which is what we would like to plot the residuals against.
Thus, we will shuffle the data a little bit:
# Sort residuals by index for later comparison with original dataset
schoenfeld_sorted=scaled_schoenfeld_residuals.sort_index()
print("\033[1mScaled Schoenfeld residuals sorted by index:\033[0m")
print(schoenfeld_sorted)
Scaled Schoenfeld residuals sorted by index:
covariate higher
3 2.134646
13 2.077670
14 2.038455
15 2.167767
17 2.156165
... ...
4426 -1.838839
4427 -1.894075
4428 1.990486
4429 2.036925
4430 -1.978273
[1509 rows x 1 columns]
Now we will prepare a subset of our clinical dataset containing only the participants experiencing the event in question (the 1509 fatalities). We will use this subset to extract the time information for our residuals, which is not included in the above function's return value.
# Subset: participants with event
survival_DEATH=survival_complete.loc[survival_complete.DEATH.eq(1)]
print("\033[1mParticipants with DEATH event:\033[0m")
print(survival_DEATH)
Participants with DEATH event:
TIMEDTH DEATH higher
3 2956 1 1.0
13 5592 1 1.0
14 6411 1 1.0
15 146 1 1.0
17 1442 1 1.0
... ... ... ...
4426 565 1 0.0
4427 4300 1 0.0
4428 7746 1 1.0
4429 6433 1 1.0
4430 6729 1 0.0
[1509 rows x 3 columns]
We will merge the fatalities dataframe survival_DEATH with the sorted residuals dataframe schoenfeld_sorted. Oops - both dataframes contain a column named 'higher', with disparate content. However, this shall not bother us too much, as the merge function will disambiguate the colliding columns with suffixes:
merged_Sr_sD = survival_DEATH.copy(deep=True)
merged_Sr_sD = merged_Sr_sD.merge(schoenfeld_sorted, left_index=True, right_index=True)
print(merged_Sr_sD)
      TIMEDTH  DEATH  higher_x  higher_y
3        2956      1       1.0  2.134646
13       5592      1       1.0  2.077670
14       6411      1       1.0  2.038455
15        146      1       1.0  2.167767
17       1442      1       1.0  2.156165
...       ...    ...       ...       ...
4426      565      1       0.0 -1.838839
4427     4300      1       0.0 -1.894075
4428     7746      1       1.0  1.990486
4429     6433      1       1.0  2.036925
4430     6729      1       0.0 -1.978273

[1509 rows x 4 columns]
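The '_x'/'_y' endings seen in the merged output above are pandas' default suffixes for colliding column names. A minimal toy sketch (the frames and values below are made up, not the study data):

```python
import pandas as pd

# Two toy frames sharing the column name 'higher'
left = pd.DataFrame({'higher': [1.0, 0.0]}, index=[3, 13])
right = pd.DataFrame({'higher': [2.13, 2.08]}, index=[3, 13])

# Index-aligned merge; colliding names get '_x' / '_y' suffixes
merged = pd.merge(left, right, left_index=True, right_index=True)
print(list(merged.columns))   # ['higher_x', 'higher_y']
```

If more descriptive names are preferred, the merge accepts an explicit `suffixes=` argument.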
Now, let's sort the merged dataframe by event time ('TIMEDTH'):
merged_Sr_sD = merged_Sr_sD.sort_values(by=['TIMEDTH'])
print(merged_Sr_sD)
      TIMEDTH  DEATH  higher_x  higher_y
1284       26      1       0.0 -1.836503
2394       34      1       0.0 -1.837050
2042       40      1       0.0 -1.837596
1876       45      1       0.0 -1.838143
2070       46      1       0.0 -1.838691
...       ...    ...       ...       ...
2176     8744      1       1.0  1.961665
1431     8744      1       0.0 -2.047287
3546     8747      1       0.0 -2.047581
1956     8753      1       0.0 -2.048540
1858     8759      1       0.0 -2.049500

[1509 rows x 4 columns]
We would like to plot the residuals against time. In our case, for the sake of simplicity, we will use untransformed time.
For data that fulfill the proportional hazards assumption, the Schoenfeld residuals will perform a noisy random walk around the zero line. Thus some nonparametric smoothing is helpful for interpretation. We will use LOWESS lines ('locally weighted scatterplot smoothing').
# Import LOWESS lines
from statsmodels.nonparametric.smoothers_lowess import lowess
# Prepare two datasets with different amounts of smoothing
schoenfeld_smooth = lowess(merged_Sr_sD['higher_y'], merged_Sr_sD['TIMEDTH'], is_sorted=False, frac=0.75, it=0)
schoenfeld_coarse = lowess(merged_Sr_sD['higher_y'], merged_Sr_sD['TIMEDTH'], is_sorted=False, frac=0.01, it=0)
Now we will plot the scaled Schoenfeld residuals (from the lifelines output) against time (from the fatalities subset of our original dataset). We will do so in three layers: raw residuals as a scatterplot, plus two LOWESS lines with different amounts of smoothing.
For data that fulfill the proportional hazards assumption, the heavily smoothed lowess line should exhibit zero slope and should, more or less, follow the zero-line.
# Define figure
plt.figure(figsize=(10,5))
# PLOT RAW RESIDUALS AS SCATTERPLOT:
plt.plot(merged_Sr_sD['TIMEDTH'], merged_Sr_sD['higher_y'], 'b.', markersize=5)
# Plot LOWESS lines
plt.plot(schoenfeld_smooth[:,0], schoenfeld_smooth[:,1], 'r', linewidth=3)
plt.plot(schoenfeld_coarse[:,0], schoenfeld_coarse[:,1], 'g', linewidth=0.2)
# Plot additional info
plt.xlabel("Untransformed Time (days)", fontsize=12, color='black')
plt.ylabel("Scaled Schoenfeld-Residuals", fontsize=12)
plt.title("Scaled Schoenfeld-Residuals", fontsize=16, color='black')
# Additional Infos
plt.text(0,-2.3, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
plt.ylim([-2.5,2.5])
plt.grid(visible=True, which='both', axis='y', color='darkgrey', linestyle='-', linewidth=0.25)
Our humble homemade visualization corroborates the findings of the test result.
Just to demonstrate why we have used a scatterplot for the raw residuals instead of a lineplot, and to demonstrate the necessity of smoothing:
# Define figure
plt.figure(figsize=(10,5))
#PLOT RAW RESIDUALS AS LINES
plt.plot(merged_Sr_sD['TIMEDTH'], merged_Sr_sD['higher_y'], 'b', linewidth=0.1)
# Plot LOWESS lines
plt.plot(schoenfeld_smooth[:,0], schoenfeld_smooth[:,1], 'r', linewidth=3)
plt.plot(schoenfeld_coarse[:,0], schoenfeld_coarse[:,1], 'g', linewidth=0.5)
# Plot additional info
plt.xlabel("Untransformed Time (days)", fontsize=12, color='black')
plt.ylabel("Scaled Schoenfeld-Residuals", fontsize=12)
plt.title("Scaled Schoenfeld-Residuals", fontsize=16, color='black')
# Additional Infos
plt.text(0,-2.3, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
plt.ylim([-2.5,2.5])
plt.grid(visible=True, which='both', axis='y', color='darkgrey', linestyle='-', linewidth=0.25)
This link will take you back to the main text (at 'Check Proportional Hazards Assumption' of the Framingham data).
As an afterthought, I have provided a little postscriptum below, which shows my modification of the lifelines code for the `check_assumptions` function that ensures plotting. This is optional reading for those with a technical interest:
If you have followed a link straight here, it might be advisable, depending on your background, to read Mini Appendix (C) first.
The inconsistent behaviour of the `check_assumptions` function was nagging, for sure...
My respective 'issue' on the lifelines GitHub repository was still open after a few days. Thus I had a look at the project's 65,000-SLOC code base and made a minor revision to the file mixins.py in the /lifelines/fitters section, which fortunately solved the issue.
I have repositioned the code snippet for plotting within the `check_assumptions` function, so that it is no longer skipped when the `proportional_hazard_test` returns a non-significant P-value. Thus, plots are generated whenever the user passes the argument `show_plots=True` to the function.
This change is advantageous for two reasons: plots become available for the visual eyeball test regardless of the test's P-value, and the function exhibits consistent `check_assumptions` behaviour (in line with the user's explicit directive). In order to demonstrate the modified behaviour of `check_assumptions`, I have included a lengthy code section for it to work standalone:
# This cell contains the code snippet from mixins.py in lifelines that defines
# the check_assumptions function, as modified according to my pull request.
# Code is truncated to the section necessary for running the function within this notebook.
# Imports that we have already done are commented out. Some imports are redundant,
# as the lifelines code base uses different wording, which has been retained for compatibility.
# I have renamed the function to `check_assumptions_forceplot` for the purpose
# of this code cell in our notebook, in order to avoid conflicts with the original function
# from the lifelines project.
# (...)
from typing import List, Optional, Dict, Any, Iterable
from textwrap import dedent, fill
#from autograd import numpy as anp
#import numpy as np
from pandas import DataFrame, Series
from lifelines.statistics import proportional_hazard_test, TimeTransformers
from lifelines.utils import format_p_value
from lifelines.utils.lowess import lowess
class ProportionalHazardMixin:
def check_assumptions_forceplot(
self,
training_df: DataFrame,
advice: bool = True,
show_plots: bool = False,
p_value_threshold: float = 0.01,
plot_n_bootstraps: int = 15,
columns: Optional[List[str]] = None,
) -> None:
"""
Use this function to test the proportional hazards assumption. See usage example at
https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html
Parameters
-----------
training_df: DataFrame
the original DataFrame used in the call to ``fit(...)`` or a sub-sampled version.
advice: bool, optional
display advice as output to the user's screen
show_plots: bool, optional
display plots of the scaled Schoenfeld residuals and loess curves. This is an eyeball test for violations.
This will slow down the function significantly.
p_value_threshold: float, optional
the threshold to use to alert the user of violations. See note below.
plot_n_bootstraps:
in the plots displayed, also display plot_n_bootstraps bootstrapped loess curves. This will slow down
the function significantly.
columns: list, optional
specify a subset of columns to test.
Returns
--------
A list of list of axes objects.
Examples
----------
.. code:: python
from lifelines.datasets import load_rossi
from lifelines import CoxPHFitter
rossi = load_rossi()
cph = CoxPHFitter().fit(rossi, 'week', 'arrest')
axes = cph.check_assumptions(rossi, show_plots=True)
Notes
-------
The ``p_value_threshold`` is arbitrarily set at 0.01. Under the null, some covariates
will be below the threshold (i.e. by chance). This is compounded when there are many covariates.
Similarly, when there are lots of observations, even minor deviances from the proportional hazard
assumption will be flagged.
With that in mind, it's best to use a combination of statistical tests and eyeball tests to
determine the most serious violations.
References
-----------
section 5 in https://socialsciences.mcmaster.ca/jfox/Books/Companion/appendices/Appendix-Cox-Regression.pdf,
http://www.mwsug.org/proceedings/2006/stats/MWSUG-2006-SD08.pdf,
http://eprints.lse.ac.uk/84988/1/06_ParkHendry2015-ReassessingSchoenfeldTests_Final.pdf
"""
if not training_df.index.is_unique:
raise IndexError(
"`training_df` index should be unique for this exercise. Please make it unique or use `.reset_index(drop=True)` to force a unique index"
)
residuals = self.compute_residuals(training_df, kind="scaled_schoenfeld")
test_results = proportional_hazard_test(self, training_df, time_transform=["rank", "km"], precomputed_residuals=residuals)
residuals_and_duration = residuals.join(training_df[self.duration_col])
Xs = self.regressors.transform_df(training_df)
counter = 0
n = residuals_and_duration.shape[0]
axes = []
for variable in self.params_.index.intersection(columns or self.params_.index):
minumum_observed_p_value = test_results.summary.loc[variable, "p"].min()
# Repositioned plotting so that it is not conditional on violation of the PH assumption
if show_plots:
axes.append([])
print()
print(" Bootstrapping lowess lines. May take a moment...")
print()
from matplotlib import pyplot as plt
fig = plt.figure()
# plot variable against all time transformations.
for i, (transform_name, transformer) in enumerate(TimeTransformers().iter(["rank", "km"]), start=1):
p_value = test_results.summary.loc[(variable, transform_name), "p"]
ax = fig.add_subplot(1, 2, i)
y = residuals_and_duration[variable]
tt = transformer(self.durations, self.event_observed, self.weights)[self.event_observed.values]
ax.scatter(tt, y, alpha=0.75)
y_lowess = lowess(tt.values, y.values)
ax.plot(tt, y_lowess, color="k", alpha=1.0, linewidth=2)
# bootstrap some possible other lowess lines. This is an approximation of the 100% confidence intervals
for _ in range(plot_n_bootstraps):
ix = sorted(np.random.choice(n, n))
tt_ = tt.values[ix]
y_lowess = lowess(tt_, y.values[ix])
ax.plot(tt_, y_lowess, color="k", alpha=0.30)
best_xlim = ax.get_xlim()
ax.hlines(0, 0, tt.max(), linestyles="dashed", linewidths=1)
ax.set_xlim(best_xlim)
ax.set_xlabel("%s-transformed time\n(p=%.4f)" % (transform_name, p_value), fontsize=10)
axes[-1].append(ax)
fig.suptitle("Scaled Schoenfeld residuals of '%s'" % variable, fontsize=14)
plt.tight_layout()
plt.subplots_adjust(top=0.90)
if np.round(minumum_observed_p_value, 2) > p_value_threshold:
continue
counter += 1
if counter == 1:
if advice:
print(
fill(
"""The ``p_value_threshold`` is set at %g. Even under the null hypothesis of no violations, some covariates will be below the threshold by chance. This is compounded when there are many covariates. Similarly, when there are lots of observations, even minor deviances from the proportional hazard assumption will be flagged."""
% p_value_threshold,
width=100,
)
)
print()
print(
fill(
"""With that in mind, it's best to use a combination of statistical tests and visual tests to determine the most serious violations. Produce visual plots using ``check_assumptions(..., show_plots=True)`` and looking for non-constant lines. See link [A] below for a full example.""",
width=100,
)
)
print()
test_results.print_summary()
print()
print()
print(
"%d. Variable '%s' failed the non-proportional test: p-value is %s."
% (counter, variable, format_p_value(4)(minumum_observed_p_value)),
end="\n\n",
)
if advice:
values = Xs["beta_"][variable]
value_counts = values.value_counts()
n_uniques = value_counts.shape[0]
# Arbitrary chosen to check for ability to use strata col.
# This should capture dichotomous / low cardinality values.
if n_uniques <= 6 and value_counts.min() >= 5:
print(
fill(
" Advice: with so few unique values (only {0}), you can include `strata=['{1}', ...]` in the call in `.fit`. See documentation in link [E] below.".format(
n_uniques, variable
),
width=100,
)
)
else:
print(
fill(
""" Advice 1: the functional form of the variable '{var}' might be incorrect. That is, there may be non-linear terms missing. The proportional hazard test used is very sensitive to incorrect functional forms. See documentation in link [D] below on how to specify a functional form.""".format(
var=variable
),
width=100,
),
end="\n\n",
)
print(
fill(
""" Advice 2: try binning the variable '{var}' using pd.cut, and then specify it in `strata=['{var}', ...]` in the call in `.fit`. See documentation in link [B] below.""".format(
var=variable
),
width=100,
),
end="\n\n",
)
print(
fill(
""" Advice 3: try adding an interaction term with your time variable. See documentation in link [C] below.""",
width=100,
),
end="\n\n",
)
if advice and counter > 0:
print(
dedent(
r"""
---
[A] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html
[B] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Bin-variable-and-stratify-on-it
[C] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Introduce-time-varying-covariates
[D] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Modify-the-functional-form
[E] https://lifelines.readthedocs.io/en/latest/jupyter_notebooks/Proportional%20hazard%20assumption.html#Stratification
"""
)
)
if counter == 0:
print("Proportional hazard assumption looks okay.")
return axes
# (...)
Let's run the modified function:
ProportionalHazardMixin.check_assumptions_forceplot(cph, training_df=survival_complete, p_value_threshold=0.05, show_plots=True)
Bootstrapping lowess lines. May take a moment...

Proportional hazard assumption looks okay.
[[<Axes: xlabel='rank-transformed time\n(p=0.9539)'>, <Axes: xlabel='km-transformed time\n(p=0.9540)'>]]
The modified code works as expected. No big deal (except for me 😉).
My pull request was accepted and merged into the master branch of the _lifelines_ library on 2023-05-01. Having contributed even a minuscule snippet to this great project makes me very happy. This link will, at long last, take you back to the main text.
The term "Survival Analysis" subsumes a bundle of methods in statistics, employed to evaluate data on time-dependent events.
Although historically often used to track mortality over time in medical research, survival analysis has much broader applications beyond fatalities, and would better be termed "Time-to-Event Analysis". The historic moniker, however, has stuck.
Amongst a broad range of potential use cases for survival analysis would be evaluation of time until:
- nonfatal myocardial infarction occurs
- customers cancel maintenance contracts
- couples get divorced
- chess novices win their first match
- laboratory mice find the exit to a maze
- political coalitions break up
- players achieve a lottery win
- mechanical failure of bearings occurs
- (...)
In survival analysis, unlike most other medical statistics, two very different domains are of interest:
Survival analysis can be used to assess the influence of one or several factors ('covariates', 'predictor variables', 'independent variables') on the occurrence and temporal distribution of events.
Survival analysis is useful for scenarios in which the event of interest is not experienced by the entire study population by the end of the study period. This is the norm rather than the exception in clinical trials, which are of finite duration by nature.
The concept of censoring is central to survival analysis. Censoring means incomplete observation of survival times. Study subjects with an incomplete observation are referred to as censored.
Traditionally, in our culture, timelines are depicted as evolving from left to right. Thus:
Right censoring occurs when participants:
- drop out of the study (i.e. withdraw consent),
- are lost to follow-up,
- remain event-free at study termination.
There may be other forms of censoring, which we will not cover here (e.g. interval-censoring, when the exact point in time of occurrence is unknown, but is known to lie between two observed points in time).
Our statistical standard methods work under the assumption that the data exhibit non-informative censoring (sometimes also called random censoring). This means that censoring is independent of survival, or of the occurrence of the event under study.
An example for nonrandom censoring would be Roman gladiators successfully escaping from the arena (thus 'withdrawing from the study') and being right-censored. This censoring would, most probably, relate to survival.
A converse example for nonrandom censoring might be all patients too old or too sick to drive to their follow-up appointment at a clinic being 'lost to follow-up', thus being right-censored. This censoring presumably would be related to survival, or rather a lack thereof.
Informative censoring may also be introduced by competing risks (e.g. cardiovascular mortality in breast cancer survival studies). Methods have been developed to 'decontaminate' models in this setting; however, these are beyond the scope of this notebook.
Let us visualize a hypothetical mini study of sun-naïve subjects sunbathing in a tropical climate without protection (idiots).
We will start our observational study at 10:00, noting time in 24-hour format, and close our observational period at 15:00, after a study duration of 5 hours.
Special cases may occur, however: violations of inclusion or exclusion criteria, for example a participant who starts sunbathing before the study opens.
The study duration for each participant is noted until whichever of the following occurs first:
Don't fret too much about the table. We will provide a graphical overview further down...
Index | Name | Enter Beach | Exhibit Sunburn | Drop Out | Lost to f'up
---|---|---|---|---|---
0 | Jack | 10:00 | yes (12:00) | no | no
1 | Jill | 11:00 | no | no | no
2 | Joe | 11:00 | yes (17:00) | no | no
3 | Jane | 10:00 | yes (13:00) | yes (14:00) | no
4 | Huey | 12:00 | no | yes (13:00) | no
5 | Dewey | 10:00 | yes† (14:00) | no | yes (??:??)
6 | Louie | 08:00 | yes (12:00) | no | no
†: Dewey has experienced sunburn at 14:00; however, we do not know about this in our study, because we have somehow lost sight of him. Thus we are unable to observe the event or examine him afterwards...
We will prepare the data for visualization step by step.
Those with any experience at all, in data manipulation with Python, may skip this section.
# Prepare an empty pandas dataframe
sunburn_study = pd.DataFrame(columns=['name','enter_TIME','sunburn','sunburn_TIME','dropout','dropout_TIME','lost','lost_TIME'],
                             index=[0,1,2,3,4,5,6])
print(sunburn_study)
  name enter_TIME sunburn sunburn_TIME dropout dropout_TIME lost lost_TIME
0  NaN        NaN     NaN          NaN     NaN          NaN  NaN       NaN
1  NaN        NaN     NaN          NaN     NaN          NaN  NaN       NaN
2  NaN        NaN     NaN          NaN     NaN          NaN  NaN       NaN
3  NaN        NaN     NaN          NaN     NaN          NaN  NaN       NaN
4  NaN        NaN     NaN          NaN     NaN          NaN  NaN       NaN
5  NaN        NaN     NaN          NaN     NaN          NaN  NaN       NaN
6  NaN        NaN     NaN          NaN     NaN          NaN  NaN       NaN
Our dataframe, up to now, is entirely populated with 'NaN', short for 'Not a Number', referring to empty data cells.
# Populate dataframe with data
# For readability, we will first construct one-dimensional series
name = np.array (['Jack', 'Jill', 'Joe', 'Jane', 'Huey', 'Dewey', 'Louie'])
enter_TIME = np.array ([ 10, 11, 11, 10, 12, 10, 8 ])
sunburn = np.array ([ 1, 0, 1, 1, 0, 1, 1 ])
sunburn_TIME = np.array ([ 12, 0, 17, 13, 0, 14, 12 ])
dropout = np.array ([ 0, 0, 0, 1, 1, 0, 0 ])
dropout_TIME = np.array ([ 0, 0, 0, 14, 13, 0, 0 ])
lost = np.array ([ 0, 0, 0, 0, 0, 1, 0 ])
lost_TIME = np.array ([ 0, 0, 0, 0, 0, np.nan, 0 ])
# Now we will write the data from our series to the dataframe
for i in range(7):
sunburn_study.loc[i] = pd.Series({'name':name[i], 'enter_TIME':enter_TIME[i],
'sunburn':sunburn[i],'sunburn_TIME':sunburn_TIME[i], 'dropout':dropout[i],
'dropout_TIME':dropout_TIME[i], 'lost':lost[i], 'lost_TIME':lost_TIME[i]})
print(sunburn_study)
    name enter_TIME sunburn sunburn_TIME dropout dropout_TIME lost lost_TIME
0   Jack         10       1           12       0            0    0       0.0
1   Jill         11       0            0       0            0    0       0.0
2    Joe         11       1           17       0            0    0       0.0
3   Jane         10       1           13       1           14    0       0.0
4   Huey         12       0            0       1           13    0       0.0
5  Dewey         10       1           14       0            0    1       NaN
6  Louie          8       1           12       0            0    0       0.0
Our dataframe represents the information from our handmade table above.
Strictly speaking, sunburn_TIME for Dewey should also be set to 'NaN', since we do not observe his event. We will leave the time of his unobserved event in place for the moment, for the purpose of visualization.
At first glance, the ensuing graph may look a bit cluttered, because it is ;-)
However, we will go through it, step by step, and we will consolidate and simplify the dataset for further analysis.
# Define figure
fig, ax = plt.subplots(figsize=(10,5))
y_pos = np.arange(len(sunburn_study['name']))
ax.set_yticks(y_pos, labels=sunburn_study['name'], fontsize=14)
plt.title("Sunburn Study: Raw Data Visualization", fontsize=16,color="#4C9900")
plt.xlim([7,18])
plt.ylim([6.5,-0.5])
plt.grid(visible=True, which='both', axis='y', color='darkgrey', linestyle='-', linewidth=1)
plt.axvline(x=10, color='b', linestyle=':')
plt.axvline(x=15, color='b', linestyle=':')
# Additional Infos
plt.text(15.4, 6.3, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# For such a simple plot, a matrix of explicit coordinates, symbols and colors
# would have been more efficient, however this is an example, that could easily be reused
# for larger datasets.
for i in range(7):
eT = sunburn_study.loc[i,'enter_TIME']
sT = sunburn_study.loc[i,'sunburn_TIME']
dT = sunburn_study.loc[i,'dropout_TIME']
lT = sunburn_study.loc[i,'lost_TIME']
if eT < 10:
dot_color = 'darkgrey'
else:
dot_color = 'green'
plt.plot(eT, i, '>', color=dot_color, markersize=15)
if sunburn_study.loc[i,'sunburn']==1:
if sT > 15:
dot_color = 'darkgrey'
elif (sT > dT) & (sunburn_study.loc[i,'dropout']==1):
dot_color = 'darkgrey'
elif (sT > lT) & (sunburn_study.loc[i,'lost']==1):
dot_color = 'darkgrey'
elif sunburn_study.loc[i,'lost']==1:
if np.isnan(lT)==True:
dot_color = 'darkgrey'
else:
dot_color = 'red'
plt.plot(sT, i, '.', color=dot_color, markersize=25)
if sunburn_study.loc[i,'dropout']==1:
        if (sunburn_study.loc[i,'sunburn']==1) and (sT < dT):
dot_color='darkgrey'
else:
dot_color='blue'
plt.plot(dT, i, '^', color=dot_color, markersize=15)
if sunburn_study.loc[i,'lost']==1:
plt.plot(lT, i, 'x', color='blue', markersize=15)
if sunburn_study.loc[i,'dropout']==0:
if sunburn_study.loc[i,'sunburn']==0:
plt.plot(15, i, '<', color='blue', markersize=15)
if sunburn_study.loc[i,'sunburn']==1:
if sT > 15:
plt.plot(15, i, '<', color='blue', markersize=15)
Let us have a look at our study participants, one by one:
0. Jack:  EVENT after 2 hrs
1. Jill:  CENSORED after 4 hrs (no event at study termination)
2. Joe:   CENSORED after 4 hrs (gets his sunburn after study termination, thus the event is not observed)
3. Jane:  EVENT after 3 hrs (withdraws after the event, which is not our concern)
4. Huey:  CENSORED after 1 hr (withdraws from the study without an event)
5. Dewey: ????? after ? hrs (entirely lost to follow-up; we do not observe his sunburn, nor do we have any observation on him prior to his escape)†
6. Louie: EVENT after 2 hrs (has in fact been sunbathing for 4 hrs, unknown to the study)††
Our raw data visualization shows absolute time of day, as noted during observation. In order to examine the data with statistical methods, we need to normalize our time variable to a relative duration for each participant, measured from entering the study until one of the following occurrences (whichever happens first):
Since we have lost Dewey to follow-up, we will set his data to missing ('NaN'), and consider the remaining six participants for analysis.
Our 'EVENT' variable will be sunburn. Our 'TIME' variable will be the duration of observation, counted for each participant from the moment of entering the study until the event or right-censoring from any cause occurs.
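Before filling in these two variables by hand (as done in the next cell), here is a sketch of how the normalization could be scripted instead. The helper `time_and_event` and the inline reconstruction of the raw data are mine, not part of the notebook's pipeline; the column names mirror the sunburn_study dataframe from above:

```python
import numpy as np
import pandas as pd

STUDY_START, STUDY_END = 10, 15  # study runs from 10:00 to 15:00

# Reconstruction of the raw data from the sunburn_study dataframe above
df = pd.DataFrame({
    'name':         ['Jack', 'Jill', 'Joe', 'Jane', 'Huey', 'Dewey', 'Louie'],
    'enter_TIME':   [10, 11, 11, 10, 12, 10, 8],
    'sunburn':      [1, 0, 1, 1, 0, 1, 1],
    'sunburn_TIME': [12, 0, 17, 13, 0, 14, 12],
    'dropout':      [0, 0, 0, 1, 1, 0, 0],
    'dropout_TIME': [0, 0, 0, 14, 13, 0, 0],
    'lost':         [0, 0, 0, 0, 0, 1, 0],
})

def time_and_event(row):
    # Lost to follow-up: no usable observation at all
    if row['lost']:
        return pd.Series({'TIME': np.nan, 'EVENT': np.nan})
    # Observation starts no earlier than the study itself (cf. Louie)
    start = max(row['enter_TIME'], STUDY_START)
    # Observation ends at the first of: sunburn, dropout, study close
    s_t = row['sunburn_TIME'] if row['sunburn'] else np.inf
    d_t = row['dropout_TIME'] if row['dropout'] else np.inf
    end = min(s_t, d_t, STUDY_END)
    return pd.Series({'TIME': float(end - start), 'EVENT': float(end == s_t)})

print(df.apply(time_and_event, axis=1))
```

This reproduces the TIME/EVENT columns that we enter manually below, including the missing row for Dewey.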
# Dataframe
sunburn_survival = pd.DataFrame(columns=['TIME','EVENT'], index=[0,1,2,3,4,5,6])
# We will q & d populate the array for our TIME variable by hand
T = np.array ([ 2, 4, 4, 3, 1, np.nan, 2 ])
# We will q & d populate the array for our EVENT variable by hand
E = np.array ([ 1, 0, 0, 1, 0, np.nan, 1 ])
for i in range(7):
sunburn_survival.loc[i] = pd.Series({'TIME':T[i], 'EVENT':E[i]})
print(sunburn_survival)
   TIME EVENT
0   2.0   1.0
1   4.0   0.0
2   4.0   0.0
3   3.0   1.0
4   1.0   0.0
5   NaN   NaN
6   2.0   1.0
Displayed in a graph, our consolidated dataset looks like this (much more uncluttered):
# Define figure
fig, ax = plt.subplots(figsize=(10,5))
y_pos = np.arange(len(sunburn_study['name']))
ax.set_yticks(y_pos, labels=sunburn_study['name'], fontsize=14)
plt.title("Sunburn Study: Time to Event Visualization", fontsize=16,color="#4C9900")
plt.xlim([-1,6])
plt.ylim([6.5,-0.5])
plt.grid(visible=True, which='both', axis='y', color='darkgrey', linestyle='-', linewidth=1)
plt.axvline(x=0, color='b', linestyle=':')
plt.axvline(x=5, color='b', linestyle=':')
# Additional Infos
plt.text(4.3, 6.3, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
for i in range(7):
if pd.isna(sunburn_survival.loc[i,'TIME']) == False:
if pd.isna(sunburn_survival.loc[i,'EVENT']) == False:
plt.plot(0, i, '>', color='g', markersize=15)
if sunburn_survival.loc[i,'EVENT'] ==1:
plt.plot(sunburn_survival.loc[i,'TIME'], i, '.', color='r', markersize=25)
else:
plt.plot(sunburn_survival.loc[i,'TIME'], i, '.', color='b', markersize=25)
This normalized view is much more straightforward to analyze. Participants either suffer an event (red) or are censored (blue). None of our participants remained within the study for the full duration of five hours, since Jill and Joe, who were right-censored at study termination, had entered one hour late.
Before running formal statistical models and tests, we will remove Dewey for good, because missing values ('NaN') will throw an error from most statistical methods. How to handle NaN properly, is discussed in the main body of the text.
sunburn_cleaned = sunburn_survival.dropna(axis=0)
print (sunburn_cleaned)
   TIME EVENT
0   2.0   1.0
1   4.0   0.0
2   4.0   0.0
3   3.0   1.0
4   1.0   0.0
6   2.0   1.0
Dewey (index = 5) is gone for good...
# We will populate the array for our TIME variable from the cleansed dataframe
Tc = np.array (sunburn_cleaned['TIME'], dtype=float)
# We will q & d populate the array for our EVENT variable from the cleansed dataframe
Ec = np.array (sunburn_cleaned['EVENT'], dtype=float)
print(Tc)
print(Ec)
[2. 4. 4. 3. 1. 2.]
[1. 0. 0. 1. 0. 1.]
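Just to make the mechanics transparent before handing these arrays to lifelines: the Kaplan-Meier estimate can be computed by hand. The following is a minimal sketch (the `kaplan_meier` helper is mine; lifelines' `KaplanMeierFitter` does the same, plus confidence intervals and much more):

```python
import numpy as np

def kaplan_meier(times, events):
    # At each distinct event time t: S(t) *= 1 - d/n, where d is the number
    # of events at t and n the number of subjects still at risk just before t.
    times = np.asarray(times, dtype=float)
    events = np.asarray(events, dtype=int)
    surv, s = [], 1.0
    for t in np.unique(times[events == 1]):
        at_risk = np.sum(times >= t)
        d = np.sum((times == t) & (events == 1))
        s *= 1.0 - d / at_risk
        surv.append((float(t), float(s)))
    return surv

# Our six participants: survival drops at t=2 (two events among five at risk)
# and at t=3; the two subjects censored at t=4 cause no further step.
print(kaplan_meier([2, 4, 4, 3, 1, 2], [1, 0, 0, 1, 0, 1]))
```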
For such a handful of participants, a Kaplan-Meier survival curve doesn't make much sense. We include it only to demonstrate the principles.
# Define figure
plt.figure(figsize=(8,6))
ax = plt.subplot(111)
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1.0, None,'%'))
# Fit and plot model
kmf.fit(Tc, event_observed=Ec)
kmf.plot_survival_function(ax=ax, ci_show=False, label='remaining without sunburn')
# Additional information
plt.tick_params(labeltop=False, labelright=True, right=True)
plt.xticks(np.arange(0, 5, step=1), fontsize = 10)
plt.grid(visible=True, which='both', axis='both', color='#C0C0C0', linestyle='-', linewidth=0.25)
plt.title("Silly Sunburn Study", color = "#4C9900")
plt.xlabel("(Hours)")
ax.set_ylim(bottom=0, top=1.1)
# Additional Infos
plt.text(0, 0.05, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Add at-risk table
add_at_risk_counts(kmf, labels='#', rows_to_show=['At risk','Events','Censored'], ax=ax, xticks=[0,1,2,3,4], fontsize = 12, color = 'grey')
plt.tight_layout()
The 'at risk' table shows the evolution of our study population. At study termination, all remaining event-free subjects are right-censored.
We see that 'at risk', 'events' and 'censored' are indeed:
We could 'upgrade' our silly sunburn study from observational to experimental, by recruiting two groups of participants, one using sunscreen (intervention group), while the other would not (control group).
We could even randomly allocate subjects to one of the two groups (provided that we find enough people to be lured into such an obvious folly). This would constitute a randomized controlled trial (RCT) of sunscreen.
We could then compare outcomes (time to sunburn) between the two groups, with statistical methodology corresponding to what we do with the Framingham dataset in the main text (to which the blue link will take you back).
Maybe, before going back, you've got time for one more illustrated opinion:
Image from 'Margarita Philosophica' (composed by Gregor Reisch 1489-1496; first printed in 1503). This hand-coloured woodcut is from the Boston Public Library copy of a later, expanded edition by Johann Grüninger (not authorized by Reisch), titled 'Margarita Philosophica Nova‘.
For those working in the fields of machine learning and visualization of 'big data', survival analysis may seem as mundane and antiquated as these early 'computers' from the 15th century AD. However, until this day, survival analysis remains one of the mainstays of clinical research, and is indispensable as such. Furthermore, survival analysis is commonly employed, amongst others, in econometrics, engineering and social sciences.
Now it may be time to go back to the opening paragraph of the main text.
Although neither necessary nor particularly useful in our hypothetical clinical analysis of the Framingham Study, we will provide an example of parametric modelling for our survival data from the complete cases dataset, just to exemplify the difference to the nonparametric Kaplan-Meier curve. Parametric modelling can be useful for prediction and formal description, if the respective model is applicable to the dataset.
We will use the parametric Weibull model for our example. It has two defining parameters:
- The λ (scale) parameter represents the time when 63.2% ( 1 - e⁻¹ ) of the population has experienced the event.
- The ρ (shape) parameter controls whether the cumulative hazard is convex or concave, representing accelerating or decelerating hazards.
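The meaning of λ can be verified numerically from the Weibull survival function S(t) = exp(-(t/λ)^ρ). A small sketch, plugging in the fitted values for higher=0 from the model summary further down (λ ≈ 35.34, ρ ≈ 1.54):

```python
import numpy as np

def weibull_survival(t, lam, rho):
    # Weibull survival function S(t) = exp(-(t/lambda)**rho)
    return np.exp(-((t / lam) ** rho))

lam, rho = 35.34, 1.54  # fitted values for higher=0 (see model summary below)

# At t = lambda the exponent is exactly 1 regardless of rho, so the
# surviving fraction is e^-1 (about 36.8%), i.e. 63.2% have had the event:
print(weibull_survival(lam, lam, rho))
```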
# Define Figure
plt.figure(figsize=(7,7))
plt.title("Parametric vs. Nonparametric Fitting", color = "#4C9900")
plt.text(15, 0.92, '(CIs shown only for Weibull)', fontsize = 10, color = 'grey', style = 'normal')
plt.text(0, 0.55, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Choose time variable from complete cases dataset
Tc = survival_complete["TIMEDTH"]/365
# Choose event variable from complete cases dataset
Ec = survival_complete["DEATH"]
# Plot nonparametric Kaplan-Meier fit
value_range = [0, 1]
color_range_kmf = ['g','b']
for value in value_range:
disc = (survival_complete["higher"] == value)
kmf_label = ("Kaplan-Meier: higher=" + str(value))
kmf.fit(Tc[disc], event_observed=Ec[disc], label=kmf_label)
kmf.plot_survival_function(ci_show=False, color=color_range_kmf[value])
# Plot parametric Weibull fit
from lifelines import WeibullFitter
wbf = WeibullFitter()
value_range = [0, 1]
print("WEIBULL MODEL SUMMARIES:")
color_range_llf = ['#ff80ff','#ff6600']
for value in value_range:
disc = (survival_complete["higher"] == value)
wbf_label = ("Weibull: higher="+str(value))
wbf.fit(Tc[disc], event_observed=Ec[disc], label=(wbf_label))
wbf.plot_survival_function(ci_show=True, color=color_range_llf[value])
print("Covariate 'higher' = ", value)
wbf.print_summary()
plt.xlabel("(Years)")
WEIBULL MODEL SUMMARIES: Covariate 'higher' = 0
model | lifelines.WeibullFitter |
---|---|
number of observations | 1822 |
number of events observed | 780 |
log-likelihood | -3697.51 |
hypothesis | lambda_ != 1, rho_ != 1 |
coef | se(coef) | coef lower 95% | coef upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|
lambda_ | 35.34 | 1.00 | 33.38 | 37.31 | 1.00 | 34.31 | <0.005 | 854.40 |
rho_ | 1.54 | 0.05 | 1.43 | 1.64 | 1.00 | 10.33 | <0.005 | 80.71 |
AIC | 7399.02 |
---|
Covariate 'higher' = 1
model | lifelines.WeibullFitter |
---|---|
number of observations | 2499 |
number of events observed | 729 |
log-likelihood | -3789.64 |
hypothesis | lambda_ != 1, rho_ != 1 |
coef | se(coef) | coef lower 95% | coef upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|
lambda_ | 48.12 | 1.73 | 44.73 | 51.51 | 1.00 | 27.26 | <0.005 | 541.33 |
rho_ | 1.54 | 0.06 | 1.43 | 1.65 | 1.00 | 9.86 | <0.005 | 73.79 |
AIC | 7583.28 |
---|
From the Weibull model summaries, ρ > 1 shows that the instantaneous hazard is increasing over time, rather than decreasing, which is in line with our nonparametric exploration of the Nelson-Aalen estimator in Mini Appendix (A).
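This statement about ρ can be checked directly: the Weibull hazard is h(t) = (ρ/λ)(t/λ)^(ρ-1), which rises with t whenever ρ > 1. A quick sketch (the helper function is mine, for illustration), using the fitted values for higher=0:

```python
def weibull_hazard(t, lam, rho):
    # Weibull hazard h(t) = (rho/lam) * (t/lam)**(rho - 1);
    # for rho > 1 the exponent is positive, so h(t) increases with t.
    return (rho / lam) * (t / lam) ** (rho - 1)

lam, rho = 35.34, 1.54  # fitted values for higher=0, from the summary above

print(weibull_hazard(5.0, lam, rho) < weibull_hazard(20.0, lam, rho))  # True
```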
One could tweak model parameters (e.g. ρ, specifying shape) or try various other parametric models to optimize fit.
Some parametric models (like Spline or Piecewise Exponential) can be handed multiple 'knots' or 'breakpoints' in order to separately model segments of the data.
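The mechanics can be sketched as follows. As I read lifelines' parametrization of the piecewise exponential model, the hazard within segment i is constant at 1/λ_i (λ_i being a scale, not a rate), so the survival function is the exponential of minus the hazard accumulated across the segments. The helper below is mine, for illustration only:

```python
import numpy as np

def piecewise_exponential_survival(t, breakpoints, lambdas):
    # S(t) = exp(-H(t)), with a constant hazard of 1/lambda_i inside each
    # segment delimited by the breakpoints (assumed lifelines parametrization).
    edges = [0.0] + list(breakpoints) + [np.inf]
    cum_hazard = 0.0
    for lo, hi, lam in zip(edges[:-1], edges[1:], lambdas):
        if t <= lo:
            break  # segments beyond t contribute nothing
        cum_hazard += (min(t, hi) - lo) / lam
    return float(np.exp(-cum_hazard))

# With no breakpoints this reduces to a plain exponential model:
print(piecewise_exponential_survival(10, [], [10]))  # e^-1, about 0.368
```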
Some parametric models could be fitted with an arbitrary number of empirically chosen breakpoints, deduced from exploratory analysis. Proceeding in such a fashion yields a nice visual fit which, however, carries little explanatory meaning.
We will do so, for the sake of demonstration:
# Define Figure
plt.figure(figsize=(7,7))
plt.title("Arbitrary Piecewise Fitting", color = "#4C9900")
plt.text(14, 0.92, '(CIs omitted for clarity)', fontsize = 9, color = 'grey', style = 'normal')
plt.text(0.3, 0.57, 'mathias.elsner: CC BY-NC-SA', fontsize = 7, color = 'grey', style = 'italic')
# Plot nonparametric Kaplan-Meier fit
value_range = [0, 1]
color_range_kmf = ['g','b']
for value in value_range:
disc = (survival_complete["higher"] == value)
kmf_label = ("Kaplan-Meier: higher=" + str(value))
kmf.fit(Tc[disc], event_observed=Ec[disc], label=kmf_label)
kmf.plot_survival_function(ci_show=False, color=color_range_kmf[value], linewidth=1, alpha=0.8)
# Plot parametric Piecewise fit
from lifelines import PiecewiseExponentialFitter
brk_list=[4,10,13,19]
for value in brk_list:
plt.axvline(x=value, color='darkgrey', linestyle=':')
pef = PiecewiseExponentialFitter(breakpoints=brk_list)
value_range = [0, 1]
print("PIECEWISE MODEL SUMMARIES:")
color_range_llf = ['m','r']
for value in value_range:
disc = (survival_complete["higher"] == value)
pef_label = ("Piecewise exponential: higher="+str(value))
pef.fit(Tc[disc], event_observed=Ec[disc], label=(pef_label))
pef.plot_survival_function(ci_show=False, color=color_range_llf[value], linewidth=2, alpha=0.7)
print("Covariate 'higher' = ", value)
pef.print_summary()
plt.xlabel("(Years)")
PIECEWISE MODEL SUMMARIES: Covariate 'higher' = 0
model | lifelines.PiecewiseExponentialFitter |
---|---|
number of observations | 1822 |
number of events observed | 780 |
log-likelihood | -3686.97 |
hypothesis | lambda_0_ != 1, lambda_1_ != 1, lambda_2_ != 1... |
coef | se(coef) | coef lower 95% | coef upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|
lambda_0_ | 101.48 | 12.03 | 77.89 | 125.06 | 1.00 | 8.35 | <0.005 | 53.71 |
lambda_1_ | 65.50 | 5.29 | 55.14 | 75.86 | 1.00 | 12.20 | <0.005 | 111.29 |
lambda_2_ | 51.88 | 5.45 | 41.20 | 62.56 | 1.00 | 9.34 | <0.005 | 66.44 |
lambda_3_ | 32.44 | 2.03 | 28.46 | 36.41 | 1.00 | 15.50 | <0.005 | 177.66 |
lambda_4_ | 27.05 | 1.86 | 23.41 | 30.69 | 1.00 | 14.04 | <0.005 | 146.25 |
AIC | 7383.94 |
---|
Covariate 'higher' = 1
model | lifelines.PiecewiseExponentialFitter |
---|---|
number of observations | 2499 |
number of events observed | 729 |
log-likelihood | -3778.61 |
hypothesis | lambda_0_ != 1, lambda_1_ != 1, lambda_2_ != 1... |
coef | se(coef) | coef lower 95% | coef upper 95% | cmp to | z | p | -log2(p) | |
---|---|---|---|---|---|---|---|---|
lambda_0_ | 172.29 | 22.67 | 127.86 | 216.72 | 1.00 | 7.56 | <0.005 | 44.45 |
lambda_1_ | 103.92 | 8.88 | 86.52 | 121.32 | 1.00 | 11.59 | <0.005 | 100.82 |
lambda_2_ | 67.08 | 6.69 | 53.97 | 80.20 | 1.00 | 9.88 | <0.005 | 74.04 |
lambda_3_ | 63.05 | 4.45 | 54.32 | 71.79 | 1.00 | 13.93 | <0.005 | 144.14 |
lambda_4_ | 40.59 | 2.66 | 35.38 | 45.81 | 1.00 | 14.88 | <0.005 | 163.85 |
AIC | 7567.21 |
---|
With four breakpoints (respectively at 4, 10, 13 and 19 years) we get a "nice" fit. However, in terms of explanatory power, we have not gained anything. This is an example of overfitting, which generally should be avoided.
This link will take you back to the clinical analysis section of the main text.
Just for fun...