Part 2 - Patient Similarity Networks (PSNs)

Part 2 - Patient Similarity Networks (PSNs)

Table of Contents

Part 2.

Patient Similarity Network Construction
DNA Methylation Network Analysis

# standard libraries
import os
import pickle

# scientific and data manipulation libraries
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.feature_selection import mutual_info_regression
from astropy.stats import median_absolute_deviation
import mygene
import astropy

# graph and network libraries
import networkx as nx

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.io as pio
from IPython.display import Image
from IPython.display import display

import warnings
warnings.filterwarnings('ignore')

# import custom functions from the previous notebook
import sys
sys.path.insert(0 , '/tutorial/')
from functions import *

10. Patient Similarity Network (PSN)

Based on the same expression matrix, we can create a patient similarity network.
Transposing the matrix switches the rows and columns,
so that patients become the columns and genes become the rows.
Because pandas computes correlations between columns, this lets you compute the correlation (or similarity) between patients based on their gene expression profiles,
and then create a network where nodes represent patients and edges represent similarities.

# Top-level data locations used throughout the notebook
raw_data_dir = '/data/raw'
intermediate_data_dir = '/data/intermediate'

# Load the filtered expression table from the intermediate data directory,
# using the first CSV column as the row index
expression_csv_path = os.path.join(intermediate_data_dir,
                                   "expression_data_filtered.csv")
df_renamed = pd.read_csv(expression_csv_path, index_col=0)

'''
#################################################
                YOUR CODE HERE
#################################################

We will now transpose the df_renamed df so that the rows represent the genes and the columns represent the patients.
Let's call the transposed df patient_gene_matrix.

'''

# Swap the axes of df_renamed; the bare name on the last line displays the
# result when this is run as a notebook cell
patient_gene_matrix = df_renamed.transpose()

patient_gene_matrix

	TCGA-38-7271	TCGA-55-7914	TCGA-95-7043	TCGA-73-4658	TCGA-86-8076	TCGA-55-7726	TCGA-44-6147	TCGA-50-5932	TCGA-44-2661	TCGA-86-7954	...	TCGA-97-A4M7	TCGA-62-A46R	TCGA-50-5055	TCGA-38-4628	TCGA-86-7713	TCGA-86-8073	TCGA-MN-A4N4	TCGA-53-7626	TCGA-44-A47G	TCGA-55-6969
A2M	17.7492	14.8513	14.1691	16.7238	15.6783	14.7566	16.4368	15.5476	15.5478	15.1337	...	15.9553	13.9511	16.3097	14.3934	15.8254	16.3773	14.9411	16.7343	15.6622	14.8136
A2ML1	4.4411	4.4530	4.5026	3.1704	4.7422	6.0918	4.0602	3.0827	3.4608	3.6367	...	4.7076	2.4821	8.0810	3.3415	5.3379	4.0817	6.7210	5.2575	3.7345	4.3058
A4GALT	10.1862	8.9312	9.0834	9.1443	9.5150	11.4452	9.1462	7.3597	9.1602	8.9344	...	9.1019	8.6653	10.8837	8.6304	9.3827	9.4234	11.9107	9.5379	9.8660	9.7702
AACSP1	3.5845	3.6762	3.8623	2.4821	3.2271	2.4821	5.6614	3.0827	3.9428	4.7089	...	5.4947	3.0956	2.4821	3.7710	2.9397	6.6519	8.8283	2.4821	4.1045	6.8425
AADAC	9.6415	6.5685	5.5634	4.3206	7.2321	6.8821	8.1176	4.7935	11.7750	7.0644	...	9.0856	3.8162	5.6471	4.6785	4.6578	6.7641	3.4947	5.7120	6.2214	5.2016
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
ZSCAN23	5.7385	4.3796	5.6728	3.6533	5.0478	6.0489	5.8058	3.7897	4.3451	5.5259	...	4.7076	2.4821	4.6683	3.4697	4.3516	7.2074	6.9167	6.0927	5.3854	3.6792
ZSCAN31	9.0817	10.9565	10.8095	8.9874	9.5810	9.0185	10.2085	9.8803	11.0327	10.2707	...	9.2033	10.5020	8.8073	9.8672	10.6419	10.2410	9.4672	10.0507	7.4843	8.5268
ZSCAN4	3.2709	3.0918	2.4821	3.4468	3.0118	5.0894	5.5341	3.5082	5.4987	3.3645	...	4.4088	7.4937	4.3167	5.0907	6.7676	5.1126	3.3143	4.2816	4.4650	2.4821
ZWINT	9.2615	10.1334	10.9149	9.7828	8.6640	11.1212	9.0718	10.7305	9.0013	10.4268	...	9.7614	10.7593	9.7087	10.4729	10.9794	10.1390	10.0583	9.4627	9.5817	10.8228
ZYG11A	4.0073	8.3219	6.7593	3.4468	4.7844	5.0894	9.4014	4.7935	4.2748	7.1709	...	6.5412	7.1933	5.8778	7.3822	8.7596	8.2478	8.4566	7.4014	4.6804	8.6068

5338 rows × 498 columns

'''
#################################################
                YOUR CODE HERE
#################################################

We will now calculate the correlation matrix for the patient_gene_matrix using the Pearson correlation method.
Store the correlation matrix in a dictionary called patient_correlation_matrices with the key 'pearson'.
Other correlation methods are optional: if you want to calculate the correlation matrix with them as well,
you can store those matrices in the same dictionary under their own keys.

'''

# Correlation matrices keyed by method name; only Pearson is computed here.
# DataFrame.corr correlates the columns of patient_gene_matrix pairwise.
patient_correlation_matrices = {
    'pearson': patient_gene_matrix.corr(method='pearson'),
}

'''
#################################################
                YOUR CODE HERE
#################################################

Create a graph from the correlation matrix using the create_graph_from_correlation function.
Set the threshold to 0.8.
Store the graph in a variable called patient_pearson_graph.

'''

# Build the patient graph from the Pearson correlation matrix, passing a
# threshold of 0.8 to the create_graph_from_correlation helper
patient_pearson_graph = create_graph_from_correlation(
    patient_correlation_matrices['pearson'],
    threshold=0.8,
)

'''
#################################################
                YOUR CODE HERE
#################################################

Visualise the graph using the visualise_graph function.
Use an appropriate title for the graph as the second argument.

'''
# Plot the thresholded (not yet pruned) patient graph
visualise_graph(
    patient_pearson_graph,
    title='Pearson Correlation Network (Threshold = 0.8)',
)

../_images/7c5a40fe5e7facad802d597ade95ddeb8f4804da9f09747204211eb8bb5c6d88.png

'''
#################################################
                YOUR CODE HERE
#################################################

Now use the clean_graph function to clean patient_pearson_graph, and store the result in patient_pearson_graph_pruned.
Consider the following parameters:
- degree_threshold
- keep_largest_component

'''
# Prune the thresholded graph; both options are forwarded by keyword to the
# clean_graph helper
patient_pearson_graph_pruned = clean_graph(
    patient_pearson_graph,
    keep_largest_component=True,
    degree_threshold=1,
)

'''
#################################################
                YOUR CODE HERE
#################################################

Visualise the pruned graph using the visualise_graph function.
Use an appropriate title for the graph as the second argument.

'''

# Plot the pruned patient graph
visualise_graph(
    patient_pearson_graph_pruned,
    title='Pearson Patient Correlation Network (Threshold = 0.8)',
)

../_images/65fb8d04761529e69a566eeb0bf9221fbeae870380c1adb85a826325afb465bc.png

'''
#################################################
                YOUR CODE HERE
#################################################

 Now do some sparsification of the graph using knn_sparsification function,
 call it patient_pearson_graph_pruned_knn.
 Set the k value to 10.               

'''

# Sparsify the pruned graph with the knn_sparsification helper, using k=10
patient_pearson_graph_pruned_knn = knn_sparsification(
    patient_pearson_graph_pruned,
    k=10,
)

'''
#################################################
                YOUR CODE HERE
#################################################

Let's see some information about the graph using the print_graph_info function.
First, print the information about the patient_pearson_graph_pruned graph.
Use print("------------------------------------") as a divider between the two graphs.
Then, print the information about the patient_pearson_graph_pruned_knn graph.            

'''

# Report the pruned graph first, then the kNN-sparsified graph, with a
# divider line between the two reports
divider = "------------------------------------"
print_graph_info(patient_pearson_graph_pruned)
print(divider)
print_graph_info(patient_pearson_graph_pruned_knn)

Number of nodes: 434
Number of edges: 10373
Sample nodes: ['TCGA-38-7271', 'TCGA-55-7914', 'TCGA-73-4658', 'TCGA-86-8076', 'TCGA-55-7726', 'TCGA-44-6147', 'TCGA-50-5932', 'TCGA-44-2661', 'TCGA-86-7954', 'TCGA-73-4662']
Sample edges: [('TCGA-38-7271', 'TCGA-73-4658', {'weight': 0.8696511707884843}), ('TCGA-38-7271', 'TCGA-86-8076', {'weight': 0.8173779911639248}), ('TCGA-38-7271', 'TCGA-44-2661', {'weight': 0.8652686243802332}), ('TCGA-38-7271', 'TCGA-73-4662', {'weight': 0.8446135021161085}), ('TCGA-38-7271', 'TCGA-55-6986', {'weight': 0.833876175940164}), ('TCGA-38-7271', 'TCGA-49-6744', {'weight': 0.8919809060862212}), ('TCGA-38-7271', 'TCGA-69-7763', {'weight': 0.8374050473870502}), ('TCGA-38-7271', 'TCGA-44-6774', {'weight': 0.802120255633156}), ('TCGA-38-7271', 'TCGA-67-3774', {'weight': 0.8179727328277505}), ('TCGA-38-7271', 'TCGA-97-A4M2', {'weight': 0.8045412817214057})]
Graph type: undirected
No self-loops in the graph.
Graph density: 0.11039686678515555
Number of connected components: 1
Average clustering coefficient: 0.6049320008124544
------------------------------------
Number of nodes: 434
Number of edges: 2956
Sample nodes: ['TCGA-38-7271', 'TCGA-55-7914', 'TCGA-73-4658', 'TCGA-86-8076', 'TCGA-55-7726', 'TCGA-44-6147', 'TCGA-50-5932', 'TCGA-44-2661', 'TCGA-86-7954', 'TCGA-73-4662']
Sample edges: [('TCGA-38-7271', 'TCGA-53-7626', {'weight': 0.8932510135537189}), ('TCGA-38-7271', 'TCGA-49-6744', {'weight': 0.8919809060862212}), ('TCGA-38-7271', 'TCGA-78-8648', {'weight': 0.8857601471754217}), ('TCGA-38-7271', 'TCGA-86-8671', {'weight': 0.8825367028721114}), ('TCGA-38-7271', 'TCGA-50-5941', {'weight': 0.8814255405956031}), ('TCGA-38-7271', 'TCGA-55-7574', {'weight': 0.8799541588637929}), ('TCGA-38-7271', 'TCGA-97-7553', {'weight': 0.8775024097687647}), ('TCGA-38-7271', 'TCGA-50-5055', {'weight': 0.8726819735362153}), ('TCGA-38-7271', 'TCGA-50-5045', {'weight': 0.8723464616125998}), ('TCGA-38-7271', 'TCGA-73-4658', {'weight': 0.8696511707884843})]
Graph type: undirected
No self-loops in the graph.
Graph density: 0.03145986100616213
Number of connected components: 1
Average clustering coefficient: 0.3799854027043237

# Plot the kNN-sparsified patient graph
visualise_graph(
    patient_pearson_graph_pruned_knn,
    title='K-Nearest Neighbors (k=10) Patient Correlation Network',
)

../_images/0a80454154d1534eb2237cb505d957f669845692c3fff42a699727b371fa0564.png

11. DNA methylation PSN

In the second task, we are preparing an additional network for the same patients, this time based on DNA methylation data.

# Read the DNA methylation pickle (ISMB_TCGA_DNAm.pkl) from the raw data directory
dnam_pickle_path = os.path.join(raw_data_dir, "ISMB_TCGA_DNAm.pkl")
with open(dnam_pickle_path, 'rb') as file:
    data = pd.read_pickle(file)

# The loaded object is indexed by key; the "datExpr" entry is used as the
# methylation table. The bare name displays it in the notebook.
meth_data = data["datExpr"]
meth_data

	cg23057992	cg16602369	cg20545544	cg04571941	cg23091104	cg11738485	cg17735539	cg12662576	cg10554839	cg08866780	...	cg04339424	cg01318691	cg27068965	cg23028848	cg21122529	cg22254072	cg10237911	cg09874052	cg27624178	cg02857943
TCGA-55-7914	0.039144	0.078881	0.032201	0.098520	0.971626	0.020020	0.929670	0.033325	0.043767	0.028051	...	0.053027	0.046047	0.951328	0.045767	0.048185	0.014262	0.034541	0.034967	0.041538	0.053116
TCGA-38-4631	0.954805	0.047287	0.027976	0.925794	0.144636	0.365842	0.156414	0.027917	0.047609	0.049901	...	0.048398	0.047221	0.939891	0.056141	0.047459	0.014548	0.056591	0.039086	0.044946	0.055473
TCGA-73-4658	0.966071	0.126549	0.030316	0.945166	0.976080	0.975638	0.942018	0.066659	0.052100	0.047054	...	0.040815	0.042040	0.951235	0.045477	0.074037	0.019053	0.029978	0.039345	0.060650	0.041385
TCGA-50-5932	0.965140	0.061194	0.935457	0.923421	0.965443	0.027598	0.040885	0.036178	0.045745	0.046439	...	0.057840	0.045106	0.948255	0.044558	0.039484	0.013837	0.037048	0.032503	0.058576	0.037666
TCGA-55-7576	0.958612	0.071607	0.027077	0.940574	0.976870	0.785073	0.953674	0.945107	0.056088	0.955472	...	0.046921	0.051442	0.954000	0.034352	0.031497	0.014924	0.035520	0.043547	0.056388	0.052208
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
TCGA-50-8457	0.960316	0.940219	0.046927	0.942980	0.977751	0.633731	0.940723	0.053398	0.059292	0.968875	...	0.047640	0.062680	0.954824	0.043137	0.048599	0.014066	0.031109	0.039247	0.035110	0.054273
TCGA-91-6840	0.964250	0.921250	0.957943	0.958416	0.981439	0.278432	0.968525	0.032143	0.083996	0.037912	...	0.045197	0.066271	0.963420	0.047621	0.060854	0.015823	0.045674	0.040006	0.052008	0.040826
TCGA-86-8280	0.948855	0.942925	0.958506	0.945667	0.970511	0.544447	0.921924	0.939427	0.082121	0.045036	...	0.062367	0.058898	0.954355	0.045702	0.048930	0.019321	0.039341	0.036627	0.054269	0.056795
TCGA-95-A4VK	0.037255	0.050940	0.036565	0.098237	0.976780	0.683394	0.960073	0.025232	0.062487	0.026908	...	0.056248	0.053591	0.947020	0.035344	0.057783	0.010943	0.051868	0.035566	0.044910	0.042269
TCGA-55-6969	0.945954	0.068404	0.923325	0.930684	0.962295	0.037952	0.153943	0.033980	0.056811	0.050211	...	0.048765	0.061228	0.930596	0.056050	0.061802	0.017861	0.044229	0.054682	0.045090	0.064186

459 rows × 300000 columns

# Unpickle the gene expression dataset (ISMB_TCGA_GE.pkl) into GE_data
ge_pickle_path = os.path.join(raw_data_dir, "ISMB_TCGA_GE.pkl")
with open(ge_pickle_path, 'rb') as file:
    GE_data = pickle.load(file)

# Reminder of the GE_data layout: the patient identifiers are read from
# GE_data["datMeta"]["patient"] and displayed here as a plain list
GE_data["datMeta"]["patient"].to_list()

['TCGA-38-7271',
 'TCGA-55-7914',
 'TCGA-95-7043',
 'TCGA-73-4658',
 'TCGA-86-8076',
 'TCGA-55-7726',
 'TCGA-44-6147',
 'TCGA-50-5932',
 'TCGA-44-2661',
 'TCGA-86-7954',
 'TCGA-73-4662',
 'TCGA-44-7671',
 'TCGA-78-8660',
 'TCGA-62-A46P',
 'TCGA-55-6978',
 'TCGA-50-6592',
 'TCGA-38-4625',
 'TCGA-80-5611',
 'TCGA-86-8054',
 'TCGA-55-6986',
 'TCGA-L9-A5IP',
 'TCGA-69-7764',
 'TCGA-49-6744',
 'TCGA-75-5125',
 'TCGA-38-4626',
 'TCGA-69-7763',
 'TCGA-86-8279',
 'TCGA-93-8067',
 'TCGA-97-8179',
 'TCGA-55-A48Y',
 'TCGA-86-8055',
 'TCGA-91-6835',
 'TCGA-55-6982',
 'TCGA-55-A4DF',
 'TCGA-44-6774',
 'TCGA-50-5066',
 'TCGA-05-5423',
 'TCGA-67-3774',
 'TCGA-97-A4M2',
 'TCGA-95-7567',
 'TCGA-49-AAR0',
 'TCGA-44-2656',
 'TCGA-53-7813',
 'TCGA-O1-A52J',
 'TCGA-35-4122',
 'TCGA-55-8092',
 'TCGA-49-6761',
 'TCGA-49-4507',
 'TCGA-55-7816',
 'TCGA-78-7145',
 'TCGA-55-6983',
 'TCGA-53-7624',
 'TCGA-97-A4M3',
 'TCGA-50-5068',
 'TCGA-78-8648',
 'TCGA-44-6778',
 'TCGA-80-5608',
 'TCGA-86-8281',
 'TCGA-MP-A4T6',
 'TCGA-55-8085',
 'TCGA-62-8399',
 'TCGA-97-A4M5',
 'TCGA-97-7547',
 'TCGA-05-5429',
 'TCGA-55-7994',
 'TCGA-55-8094',
 'TCGA-05-4425',
 'TCGA-44-4112',
 'TCGA-49-6767',
 'TCGA-49-4490',
 'TCGA-MP-A4T9',
 'TCGA-50-5942',
 'TCGA-MP-A4SV',
 'TCGA-49-AAR4',
 'TCGA-05-4397',
 'TCGA-44-A47A',
 'TCGA-86-8359',
 'TCGA-78-7539',
 'TCGA-MP-A4T8',
 'TCGA-99-8032',
 'TCGA-50-6595',
 'TCGA-55-6968',
 'TCGA-44-8120',
 'TCGA-55-8302',
 'TCGA-99-8025',
 'TCGA-64-1679',
 'TCGA-95-8039',
 'TCGA-44-A479',
 'TCGA-44-6148',
 'TCGA-NJ-A55O',
 'TCGA-MP-A5C7',
 'TCGA-64-5778',
 'TCGA-55-6971',
 'TCGA-49-AARN',
 'TCGA-44-A47B',
 'TCGA-55-5899',
 'TCGA-49-AAQV',
 'TCGA-99-8028',
 'TCGA-75-6205',
 'TCGA-97-8552',
 'TCGA-50-8459',
 'TCGA-05-5425',
 'TCGA-78-7150',
 'TCGA-86-A4P7',
 'TCGA-49-4512',
 'TCGA-55-8206',
 'TCGA-55-8614',
 'TCGA-64-5815',
 'TCGA-L9-A50W',
 'TCGA-73-4675',
 'TCGA-55-7995',
 'TCGA-05-4433',
 'TCGA-55-7727',
 'TCGA-44-2668',
 'TCGA-44-A4SU',
 'TCGA-55-7907',
 'TCGA-69-7765',
 'TCGA-49-4487',
 'TCGA-44-2662',
 'TCGA-67-6216',
 'TCGA-55-7283',
 'TCGA-86-8280',
 'TCGA-91-6840',
 'TCGA-78-7154',
 'TCGA-49-4488',
 'TCGA-93-7348',
 'TCGA-62-A470',
 'TCGA-78-7147',
 'TCGA-50-5936',
 'TCGA-55-6984',
 'TCGA-50-5941',
 'TCGA-69-7978',
 'TCGA-78-7220',
 'TCGA-55-8616',
 'TCGA-44-A4SS',
 'TCGA-55-7570',
 'TCGA-78-7146',
 'TCGA-44-3398',
 'TCGA-05-5420',
 'TCGA-50-5072',
 'TCGA-05-4396',
 'TCGA-05-4405',
 'TCGA-50-5935',
 'TCGA-38-4629',
 'TCGA-55-8619',
 'TCGA-05-4410',
 'TCGA-73-4676',
 'TCGA-97-8172',
 'TCGA-44-7661',
 'TCGA-05-4384',
 'TCGA-44-2655',
 'TCGA-80-5607',
 'TCGA-67-3770',
 'TCGA-91-6836',
 'TCGA-95-7562',
 'TCGA-55-8511',
 'TCGA-44-6776',
 'TCGA-95-7948',
 'TCGA-91-7771',
 'TCGA-50-5944',
 'TCGA-MN-A4N5',
 'TCGA-73-4677',
 'TCGA-78-7540',
 'TCGA-91-6829',
 'TCGA-78-8640',
 'TCGA-62-8398',
 'TCGA-55-8512',
 'TCGA-83-5908',
 'TCGA-55-6987',
 'TCGA-93-A4JP',
 'TCGA-73-A9RS',
 'TCGA-L4-A4E5',
 'TCGA-86-8074',
 'TCGA-86-8358',
 'TCGA-78-7158',
 'TCGA-91-8497',
 'TCGA-49-AARO',
 'TCGA-78-7159',
 'TCGA-55-7227',
 'TCGA-86-7714',
 'TCGA-L9-A7SV',
 'TCGA-78-7143',
 'TCGA-91-8499',
 'TCGA-49-AAR3',
 'TCGA-55-8620',
 'TCGA-69-8255',
 'TCGA-75-6207',
 'TCGA-62-A46Y',
 'TCGA-NJ-A4YF',
 'TCGA-91-6830',
 'TCGA-62-8395',
 'TCGA-49-4486',
 'TCGA-44-6145',
 'TCGA-86-A4P8',
 'TCGA-78-7537',
 'TCGA-44-3919',
 'TCGA-35-4123',
 'TCGA-62-8394',
 'TCGA-69-7761',
 'TCGA-62-A46U',
 'TCGA-97-8547',
 'TCGA-97-7554',
 'TCGA-50-6673',
 'TCGA-95-7039',
 'TCGA-95-7944',
 'TCGA-55-8301',
 'TCGA-78-7152',
 'TCGA-05-4390',
 'TCGA-44-7659',
 'TCGA-97-7941',
 'TCGA-49-4514',
 'TCGA-55-A490',
 'TCGA-55-8508',
 'TCGA-MP-A4TE',
 'TCGA-97-A4M1',
 'TCGA-75-6206',
 'TCGA-86-8671',
 'TCGA-78-7160',
 'TCGA-64-1681',
 'TCGA-49-4494',
 'TCGA-50-5946',
 'TCGA-55-7913',
 'TCGA-44-6779',
 'TCGA-49-AARE',
 'TCGA-05-4403',
 'TCGA-99-AA5R',
 'TCGA-73-4659',
 'TCGA-50-8457',
 'TCGA-75-5147',
 'TCGA-55-8507',
 'TCGA-78-7536',
 'TCGA-95-A4VK',
 'TCGA-38-4627',
 'TCGA-67-6215',
 'TCGA-69-7973',
 'TCGA-05-5715',
 'TCGA-75-7030',
 'TCGA-44-2666',
 'TCGA-62-A472',
 'TCGA-55-6985',
 'TCGA-J2-A4AG',
 'TCGA-97-A4LX',
 'TCGA-55-6543',
 'TCGA-97-7938',
 'TCGA-35-3615',
 'TCGA-44-2657',
 'TCGA-55-8505',
 'TCGA-97-8175',
 'TCGA-49-4510',
 'TCGA-MP-A4TK',
 'TCGA-50-5044',
 'TCGA-49-AAR2',
 'TCGA-05-4250',
 'TCGA-97-7937',
 'TCGA-64-5774',
 'TCGA-86-8674',
 'TCGA-50-6590',
 'TCGA-64-5779',
 'TCGA-75-5146',
 'TCGA-44-6144',
 'TCGA-78-8662',
 'TCGA-55-7911',
 'TCGA-86-8585',
 'TCGA-38-A44F',
 'TCGA-64-1680',
 'TCGA-97-A4M6',
 'TCGA-75-6214',
 'TCGA-55-6980',
 'TCGA-97-7553',
 'TCGA-55-A48X',
 'TCGA-64-1677',
 'TCGA-73-7498',
 'TCGA-38-6178',
 'TCGA-44-7670',
 'TCGA-62-A471',
 'TCGA-49-4505',
 'TCGA-NJ-A7XG',
 'TCGA-55-6981',
 'TCGA-91-6848',
 'TCGA-55-8090',
 'TCGA-55-7725',
 'TCGA-55-8207',
 'TCGA-44-6146',
 'TCGA-05-4434',
 'TCGA-55-6979',
 'TCGA-05-4427',
 'TCGA-55-8615',
 'TCGA-50-5939',
 'TCGA-05-4418',
 'TCGA-67-6217',
 'TCGA-49-6745',
 'TCGA-55-1595',
 'TCGA-49-6742',
 'TCGA-05-4402',
 'TCGA-05-4382',
 'TCGA-55-7576',
 'TCGA-67-3773',
 'TCGA-78-7633',
 'TCGA-50-6597',
 'TCGA-44-2659',
 'TCGA-95-7947',
 'TCGA-55-7724',
 'TCGA-J2-A4AD',
 'TCGA-55-8091',
 'TCGA-55-1592',
 'TCGA-73-4670',
 'TCGA-55-1594',
 'TCGA-55-8621',
 'TCGA-50-5051',
 'TCGA-49-4501',
 'TCGA-J2-8194',
 'TCGA-44-8119',
 'TCGA-55-8203',
 'TCGA-97-8177',
 'TCGA-55-7573',
 'TCGA-55-8089',
 'TCGA-38-4630',
 'TCGA-78-7166',
 'TCGA-38-4631',
 'TCGA-55-1596',
 'TCGA-91-A4BD',
 'TCGA-67-3771',
 'TCGA-J2-8192',
 'TCGA-55-A48Z',
 'TCGA-97-8176',
 'TCGA-86-6851',
 'TCGA-50-5931',
 'TCGA-NJ-A4YI',
 'TCGA-97-7552',
 'TCGA-MP-A4T4',
 'TCGA-L4-A4E6',
 'TCGA-44-5643',
 'TCGA-MP-A4TD',
 'TCGA-05-4244',
 'TCGA-50-5930',
 'TCGA-44-6777',
 'TCGA-05-4430',
 'TCGA-05-4426',
 'TCGA-44-6775',
 'TCGA-05-4420',
 'TCGA-55-8506',
 'TCGA-44-7672',
 'TCGA-62-8402',
 'TCGA-86-8056',
 'TCGA-05-4422',
 'TCGA-75-7027',
 'TCGA-78-7535',
 'TCGA-50-8460',
 'TCGA-75-7031',
 'TCGA-L9-A444',
 'TCGA-73-4666',
 'TCGA-86-7953',
 'TCGA-86-A4D0',
 'TCGA-NJ-A4YQ',
 'TCGA-91-8496',
 'TCGA-67-3772',
 'TCGA-55-7281',
 'TCGA-05-4424',
 'TCGA-69-A59K',
 'TCGA-75-7025',
 'TCGA-55-8514',
 'TCGA-95-8494',
 'TCGA-05-4395',
 'TCGA-93-A4JQ',
 'TCGA-44-8117',
 'TCGA-55-8204',
 'TCGA-50-5933',
 'TCGA-MN-A4N1',
 'TCGA-55-7903',
 'TCGA-86-8669',
 'TCGA-55-6970',
 'TCGA-50-6594',
 'TCGA-86-8075',
 'TCGA-MP-A4TF',
 'TCGA-69-7760',
 'TCGA-78-7161',
 'TCGA-91-6849',
 'TCGA-99-8033',
 'TCGA-55-8205',
 'TCGA-55-8510',
 'TCGA-91-6828',
 'TCGA-50-5049',
 'TCGA-99-7458',
 'TCGA-49-AARR',
 'TCGA-MP-A4TC',
 'TCGA-49-AARQ',
 'TCGA-93-A4JN',
 'TCGA-95-A4VP',
 'TCGA-69-8453',
 'TCGA-55-8513',
 'TCGA-55-6975',
 'TCGA-86-8278',
 'TCGA-75-6203',
 'TCGA-49-6743',
 'TCGA-55-7574',
 'TCGA-78-8655',
 'TCGA-05-4389',
 'TCGA-86-7701',
 'TCGA-91-6831',
 'TCGA-55-7728',
 'TCGA-97-A4M0',
 'TCGA-50-6593',
 'TCGA-86-6562',
 'TCGA-62-A46S',
 'TCGA-86-A4JF',
 'TCGA-J2-A4AE',
 'TCGA-55-8087',
 'TCGA-78-7542',
 'TCGA-78-7148',
 'TCGA-55-A493',
 'TCGA-91-A4BC',
 'TCGA-05-4432',
 'TCGA-55-6712',
 'TCGA-4B-A93V',
 'TCGA-93-7347',
 'TCGA-86-8668',
 'TCGA-MP-A4TI',
 'TCGA-69-8254',
 'TCGA-64-1676',
 'TCGA-62-A46V',
 'TCGA-78-7167',
 'TCGA-55-7284',
 'TCGA-78-7162',
 'TCGA-75-6212',
 'TCGA-97-7546',
 'TCGA-44-5644',
 'TCGA-55-8299',
 'TCGA-75-6211',
 'TCGA-MP-A4SW',
 'TCGA-78-7149',
 'TCGA-S2-AA1A',
 'TCGA-95-A4VN',
 'TCGA-55-8096',
 'TCGA-05-4398',
 'TCGA-97-8171',
 'TCGA-55-A492',
 'TCGA-71-8520',
 'TCGA-44-3396',
 'TCGA-55-A57B',
 'TCGA-L9-A443',
 'TCGA-55-A4DG',
 'TCGA-67-4679',
 'TCGA-64-5781',
 'TCGA-93-A4JO',
 'TCGA-NJ-A4YP',
 'TCGA-69-7974',
 'TCGA-MP-A4T7',
 'TCGA-55-6642',
 'TCGA-49-AAR9',
 'TCGA-69-8253',
 'TCGA-05-4249',
 'TCGA-44-7669',
 'TCGA-71-6725',
 'TCGA-73-7499',
 'TCGA-MP-A4TH',
 'TCGA-55-8208',
 'TCGA-78-7156',
 'TCGA-75-5122',
 'TCGA-50-7109',
 'TCGA-NJ-A55R',
 'TCGA-53-A4EZ',
 'TCGA-NJ-A4YG',
 'TCGA-86-A456',
 'TCGA-38-4632',
 'TCGA-MP-A4TJ',
 'TCGA-97-8174',
 'TCGA-MP-A4SY',
 'TCGA-62-8397',
 'TCGA-L9-A8F4',
 'TCGA-75-5126',
 'TCGA-MP-A4TA',
 'TCGA-86-7711',
 'TCGA-50-5045',
 'TCGA-05-4417',
 'TCGA-44-7660',
 'TCGA-69-7979',
 'TCGA-55-A491',
 'TCGA-L9-A743',
 'TCGA-55-A494',
 'TCGA-44-7662',
 'TCGA-55-7910',
 'TCGA-69-7980',
 'TCGA-55-8097',
 'TCGA-73-4668',
 'TCGA-NJ-A55A',
 'TCGA-86-8672',
 'TCGA-86-8673',
 'TCGA-78-7153',
 'TCGA-97-A4M7',
 'TCGA-62-A46R',
 'TCGA-50-5055',
 'TCGA-38-4628',
 'TCGA-86-7713',
 'TCGA-86-8073',
 'TCGA-MN-A4N4',
 'TCGA-53-7626',
 'TCGA-44-A47G',
 'TCGA-55-6969']

We don’t want to include all CpG sites in our analysis, so we use a dataset from the EWAS Catalog that contains smoking-related CpG sites.

# Load the tab-separated smoking annotation table from the raw data directory
# and display it
smoking_tsv_path = os.path.join(raw_data_dir, "smoking.tsv")
smoking_df = pd.read_csv(smoking_tsv_path, sep='\t')
smoking_df

	author	consortium	pmid	date	trait	efo	analysis	source	outcome	exposure	...	chrpos	chr	pos	gene	type	beta	se	p	details	study_id
0	Sikdar S	CHARGE	31536415	2019-09-19	Smoking	EFO_0009115, EFO_0006527, EFO_0004318	random effects meta-analysis	Table S2	DNA methylation	Smoking	...	chr2:233284934	2	233284934	-	Island	-0.08545	0.00131	0.000000	NaN	31536415_Sikdar-S_smoking_random_effects_meta-...
1	Sikdar S	CHARGE	31536415	2019-09-19	Smoking	EFO_0009115, EFO_0006527, EFO_0004318	random effects meta-analysis	Table S2	DNA methylation	Smoking	...	chr19:17000585	19	17000585	F2RL3	North shore	-0.07233	0.00129	0.000000	NaN	31536415_Sikdar-S_smoking_random_effects_meta-...
2	Sikdar S	CHARGE	31536415	2019-09-19	Smoking	EFO_0009115, EFO_0006527, EFO_0004318	random effects meta-analysis	Table S2	DNA methylation	Smoking	...	chr5:373378	5	373378	AHRR	North shore	-0.03138	0.00078	0.000000	NaN	31536415_Sikdar-S_smoking_random_effects_meta-...
3	Sikdar S	CHARGE	31536415	2019-09-19	Smoking	EFO_0009115, EFO_0006527, EFO_0004318	random effects meta-analysis	Table S2	DNA methylation	Smoking	...	chr2:233284402	2	233284402	-	Island	-0.07381	0.00132	0.000000	NaN	31536415_Sikdar-S_smoking_random_effects_meta-...
4	Sikdar S	CHARGE	31536415	2019-09-19	Smoking	EFO_0009115, EFO_0006527, EFO_0004318	random effects meta-analysis	Table S2	DNA methylation	Smoking	...	chr6:30720080	6	30720080	-	Open sea	-0.08558	0.00159	0.000000	NaN	31536415_Sikdar-S_smoking_random_effects_meta-...
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
40124	Domingo-Relloso A	Strong Heart Study	32484362	2020-06-02	Smoking	EFO_0004318, EFO_0005671, EFO_0006527	Former vs never	Table S3	DNA methylation	Smoking	...	chr6:32118295	6	32118295	PRRT1	Island	-0.05100	0.01310	0.000099	NaN	32484362_Domingo-Relloso-A_smoking_former_vs_n...
40125	Domingo-Relloso A	Strong Heart Study	32484362	2020-06-02	Smoking	EFO_0004318, EFO_0005671, EFO_0006527	Former vs never	Table S3	DNA methylation	Smoking	...	chr1:75599645	1	75599645	LHX8	North shore	-0.07300	0.01880	0.000099	NaN	32484362_Domingo-Relloso-A_smoking_former_vs_n...
40126	Domingo-Relloso A	Strong Heart Study	32484362	2020-06-02	Smoking	EFO_0004318, EFO_0005671, EFO_0006526	Pack years	Table S4	DNA methylation	Smoking	...	chr6:25992047	6	25992047	-	North shore	0.00268	0.00070	0.000099	NaN	32484362_Domingo-Relloso-A_smoking_pack_years
40127	Domingo-Relloso A	Strong Heart Study	32484362	2020-06-02	Smoking	EFO_0004318, EFO_0005671, EFO_0006526	Pack years	Table S4	DNA methylation	Smoking	...	chr10:33421027	10	33421027	-	Open sea	0.00112	0.00030	0.000099	NaN	32484362_Domingo-Relloso-A_smoking_pack_years
40128	Domingo-Relloso A	Strong Heart Study	32484362	2020-06-02	Smoking	EFO_0004318, EFO_0005671, EFO_0006526	Pack years	Table S4	DNA methylation	Smoking	...	chr15:41895799	15	41895799	-	Open sea	0.00109	0.00030	0.000099	NaN	32484362_Domingo-Relloso-A_smoking_pack_years

40129 rows × 31 columns

'''
#################################################
                YOUR CODE HERE
#################################################
1. Identify CpG sites that are commonly annotated in the smoking dataset
2. Filter the DNA methylation data to include only the common CpG sites identified in the previous step
3. Identify patients that are present in both the gene expression dataset and the methylation dataset
4. Filter the methylation data to include only the common patients and common CpG sites
5. Transpose the filtered methylation data matrix
'''

# A CpG counts as "commonly annotated" when it appears in more than this many
# rows of the smoking dataset
min_cpg_annotations = 10

# Step 1: Count the occurrences of each unique value in the 'cpg' column using value_counts
cpg_counts = smoking_df['cpg'].value_counts()

# Step 2: Filter the counts to keep only those greater than min_cpg_annotations
filtered_cpg_counts = cpg_counts[cpg_counts > min_cpg_annotations]

# Step 3: Get the index of the filtered counts and convert it to a list
common_annotated_cpgs = filtered_cpg_counts.index.tolist()

# Steps 4-5: Identify common CpG sites between the annotated list and the
# methylation dataset. The intersection is sorted because plain set iteration
# order for strings changes between interpreter runs, which would make the
# row order of patient_meth_matrix non-reproducible.
cpgs = sorted(set(common_annotated_cpgs) & set(meth_data.columns))

# Step 6: Identify common patients between the gene expression and methylation
# datasets, sorted for the same reproducibility reason
common_patients = sorted(set(GE_data["datMeta"]["patient"].to_list()) & set(meth_data.index))

# Step 7: Filter the methylation data to include only the common patients and common CpG sites
meth_data_filt = meth_data.loc[common_patients, cpgs]

# Step 8: Transpose the filtered methylation data matrix and call it patient_meth_matrix
patient_meth_matrix = meth_data_filt.T

# let's inspect the patient_meth_matrix that we have created
patient_meth_matrix

	TCGA-75-6205	TCGA-97-7941	TCGA-97-7547	TCGA-95-7567	TCGA-95-7944	TCGA-69-7979	TCGA-55-7913	TCGA-55-8206	TCGA-55-6986	TCGA-64-5779	...	TCGA-91-8497	TCGA-MP-A4TC	TCGA-J2-8194	TCGA-75-7030	TCGA-50-5942	TCGA-86-7954	TCGA-78-7145	TCGA-62-A46R	TCGA-L9-A7SV	TCGA-86-8279
cg01731783	0.497444	0.396698	0.396081	0.403062	0.498713	0.411565	0.524197	0.400507	0.417326	0.518443	...	0.446258	0.568588	0.437242	0.452883	0.327272	0.467690	0.311361	0.421168	0.481429	0.429422
cg25949550	0.317716	0.140767	0.200784	0.483062	0.149553	0.259965	0.110951	0.144628	0.415628	0.165865	...	0.200756	0.206802	0.227245	0.205266	0.231608	0.159593	0.193704	0.331710	0.147767	0.137870
cg18316974	0.843950	0.579421	0.750254	0.879412	0.626643	0.192301	0.651054	0.810144	0.696567	0.545015	...	0.497423	0.680952	0.433783	0.515681	0.626869	0.515729	0.903988	0.860906	0.725199	0.371918
cg17372101	0.391085	0.602731	0.729051	0.349166	0.423132	0.378651	0.326185	0.575440	0.607495	0.251461	...	0.571380	0.254356	0.269948	0.499414	0.516246	0.357175	0.171478	0.269681	0.134351	0.192490
cg18146737	0.650549	0.442971	0.438343	0.735122	0.543430	0.184128	0.498125	0.386880	0.554262	0.494514	...	0.407518	0.578550	0.345746	0.446022	0.401742	0.427868	0.415197	0.753086	0.364550	0.419654
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
cg12803068	0.820755	0.664510	0.890526	0.899519	0.858862	0.817286	0.699223	0.796747	0.589624	0.692833	...	0.690219	0.766702	0.676125	0.582260	0.755870	0.683585	0.901678	0.796409	0.748843	0.822947
cg01207684	0.508975	0.187581	0.124399	0.299321	0.538336	0.128008	0.331944	0.184965	0.290967	0.246457	...	0.251051	0.352854	0.128596	0.280396	0.163345	0.287115	0.113246	0.338427	0.101977	0.099196
cg25305703	0.750465	0.858270	0.358668	0.315991	0.804947	0.682358	0.732886	0.861646	0.820864	0.488925	...	0.861702	0.685447	0.810510	0.811707	0.907132	0.818094	0.597665	0.818712	0.111031	0.356221
cg14580211	0.852855	0.760703	0.877694	0.837730	0.819975	0.874536	0.639945	0.844139	0.851723	0.789981	...	0.845984	0.648610	0.715354	0.746873	0.822599	0.887925	0.913990	0.834070	0.940591	0.797121
cg18446336	0.659491	0.637922	0.815690	0.795733	0.763106	0.813799	0.723343	0.828749	0.682581	0.751079	...	0.749783	0.687554	0.643351	0.789296	0.756348	0.712804	0.820558	0.760244	0.842471	0.862028

112 rows × 442 columns

We can finish our network following the previous steps using the functions we have created.

# Pearson correlation between the columns of patient_meth_matrix, stored
# under the method name
p_meth_correlation_matrices = {
    'pearson': patient_meth_matrix.corr(method='pearson'),
}

# Turn the correlation matrix into a graph (threshold 0.8), then clean it
# with the same settings used for the gene expression network
p_meth_pearson_graph = create_graph_from_correlation(
    p_meth_correlation_matrices['pearson'],
    threshold=0.8,
)
p_meth_pearson_graph_pruned = clean_graph(
    p_meth_pearson_graph,
    keep_largest_component=True,
    degree_threshold=1,
)

# Plot the pruned methylation graph
visualise_graph(
    p_meth_pearson_graph_pruned,
    title='Pearson Correlation Network (Threshold = 0.8)',
)

../_images/c0e9e45581820553ddf60cf99aea9a07456baa5729043b59cfe8e0fd36b0d3a7.png

# Sparsify the pruned methylation graph using knn_sparsification (k=10)
p_meth_pearson_graph_pruned_knn = knn_sparsification(p_meth_pearson_graph_pruned, k=10)

# Visualise the sparsified graph using the visualise_graph function.
# The title names the kNN step so this figure is not confused with the
# pruned-only plot, which previously carried the identical title; the wording
# follows the title of the gene expression kNN plot.
visualise_graph(p_meth_pearson_graph_pruned_knn,
                title='K-Nearest Neighbors (k=10) DNA Methylation Patient Correlation Network')

../_images/6995f307cd4e61ac636e7b310ee276522b9eb22b6d0895b471c3f082a9115063.png