-
Notifications
You must be signed in to change notification settings - Fork 1
/
PPI_test.py
244 lines (195 loc) · 10.9 KB
/
PPI_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
from src.vis_tools import *
from src.report import *
from src.util import (basic_net_stats,
get_graph_hubs,
generate_path,
get_conversion_map)
import copy
from collections import Counter
from bioservices import UniProt
file_path = "Dataset\PathLinker_2018_human-ppi-weighted-cap0_75.txt"
df = pd.read_csv(file_path, sep='\t')
df = df[:5000] # taking a portion of the graph
print("------------------------------Data Inspection and Visualization---------------------------------------")
# checking for any NaN values in each row and column
print("Check for any missing data: ", df.isna().any().any()) # return false
# checking the range of probabilities in edge_weight column
print(f"The minimum interaction probability {min(df['edge_weight'])}") # 0
print(f"The maximum interaction probability {max(df['edge_weight'])}") # 0.75
# first required point
PPI_graph = visualize_graph(df) # custom function for visualization
basic_net_stats(PPI_graph) # For the whole graph
""" number of nodes 17168
number of edges 612516
average degree 71.36 """
# return the most important 20 hubs in the PPIN
hubs = get_graph_hubs(PPI_graph, 20)
print(f"The highest proteins in degree (hubs) {hubs}")
# Strongly connected component in the PPIN for Diameter computation.
components_PPI = np.array([PPI_graph.subgraph(
c) for c in nx.strongly_connected_components(PPI_graph)], dtype=object)
sorted_indices = np.argsort([len(comp) for comp in components_PPI])[::-1]
max_comp_graph = components_PPI[sorted_indices]
print(f"The 1st largest connected Component: {max_comp_graph[0]}")
print(f"The 2nd largest connected Component: {max_comp_graph[1]}")
# for the whole graph it's equal to 8
graph_diameter = nx.diameter(max_comp_graph[0])
print(f"The maximum shortest path in the PPIN of {graph_diameter} Steps")
# transitivity of the interactome as directed as it is.
# transitivity_value = nx.transitivity(PPI_graph) # computational exhaustive, equal to 0.06446 for whole graph
# Transitivity of the interactome as undirected compared to random graph (Transitivity is a measure of modularity as a global measure)
PPI_graph_undirected = copy.deepcopy(PPI_graph).to_undirected()
original_transitivity = nx.transitivity(PPI_graph_undirected)
print("Original PPIN transitivity Coefficient (no direction considered):",
original_transitivity) # 0.0679 for whole graph
random_graph = nx.gnm_random_graph(
PPI_graph.number_of_nodes(), PPI_graph.number_of_edges())
random_transitivity = nx.transitivity(random_graph)
print("Random Graph transitivity Coefficient (no direction considered):",
random_transitivity) # 0.00414 for whole graph
# Clustering coefficient (local measure)
original_clustering_coefficient = nx.average_clustering(PPI_graph_undirected)
random_clustering_coefficient = nx.average_clustering(random_graph)
print("Original PPIN Clustering Coefficient (no direction considered):",
original_clustering_coefficient) # 0.135 # for whole graph
print("Random Graph Clustering Coefficient (no direction considered):",
random_clustering_coefficient) # 0.0021 for whole graph
print("------------------------------k-Cyclic Shortest Paths---------------------------------------")
# testing example for acyclic shortest path computation (taken from lecture's slides)
G = nx.DiGraph()
G.add_edges_from([(1, 2, {'weight': 0.1}), (2, 3, {'weight': 0.2}), (3, 4, {
'weight': 0.3}), (1, 5, {'weight': 0.3}), (5, 4, {'weight': 0.5})])
# ,(2, 3, {'weight': 1})
# Specify the layout for better visualization
pos = nx.circular_layout(G)
# Draw the graph
# nx.draw(G, pos, with_labels=True, node_size=700, node_color='skyblue', font_size=10, font_color='black', font_weight='bold')
# Draw edge labels
edge_labels = nx.get_edge_attributes(G, 'weight')
# nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels);
file_path = "Exported_files/acyclic_shortest_paths_test1.pdf"
find_acyclic_K_shortest_paths(G, 1, 4, file_path)
# generate a path and get the shortest path
source, target = generate_path(PPI_graph)
print("\nThe source and target proteins generated randomly in order", source, target)
file_path = "Exported_files/acyclic_shortest_paths_test2.pdf"
find_acyclic_K_shortest_paths(PPI_graph, source, target, file_path, 2)
# these source and target proteins have been deduced from generate_path function.
source = "Q00535"
target = "P14373"
file_path = "Exported_files/acyclic_shortest_paths_test3.pdf"
find_acyclic_K_shortest_paths(PPI_graph, source, target, file_path, 2)
# the actual path: ['Q00535', 'Q9NXR1', 'P14373']
# no feedback loop between the first two nodes
print("checking if there is a feedback loop between two nodes in a path")
print(
f'check if there is an edge between ("Q00535", "Q9NXR1") {PPI_graph.has_edge("Q00535", "Q9NXR1")}')
print(
f'check if there a path back from ("Q9NXR1", "Q00535") {nx.has_path(PPI_graph, "Q9NXR1", "Q00535")}')
# list paths' (source & target) that are acyclic all of them of length 3 (small world effect)
path_list = [
("Q9NXR8", "Q92793"), ("Q00535", "P14373"), ("Q00535",
"Q9BRK4"), ("Q92831", "A8MW92"), ("Q92831", "Q9UBN7"),
("Q9NXR8", "Q9Y618"), ("Q9NXR8", "Q9UKV0"), ("Q9NXR8",
"O14744"), ("Q92831", "Q9BY41"),
("Q92831", "P28749"), ("Q9NXR8", "P02751"), ("Q92831",
"Q15573"), ("Q9NXR8", "O75376"),
("Q9NXR8", "Q96NS5"), ("Q9NXR8", "P00533"), ("Q00535",
"Q08379"), ("Q92831", "Q93034"),
("Q9NXR8", "P10070"), ("Q9NXR8", "Q9BW71"), ("Q9NXR8",
"Q04206"), ("Q92831", "Q8IZL8"),
("Q9NXR8", "Q8WTS6"), ("Q92831", "P11509"), ("Q9NXR8",
"P33993"), ("Q9NXR8", "Q14997"),
("Q9NXR8", "Q9UBU9"), ("Q9NXR8", "Q14566"), ("Q92831",
"Q6PL18"), ("Q9NXR8", "O14929"),
("Q92831", "Q14134"), ("Q92831", "A8MW92")]
# candidates for hubs ("Q9NXR8", "Q00535", "Q92831"), most repeated in acyclic paths, they act as source
# targets are unique
# a question is raised about these candidates. lists report about its degree
betCent = nx.betweenness_centrality(PPI_graph, normalized=True, endpoints=True)
print("\nsource nodes")
print(f"Degree of Q9NXR8: {PPI_graph.degree('Q9NXR8')}",
f"Betweenness Centrality of Q92831: {betCent['Q92831']}")
print(f"Degree of Q00535: {PPI_graph.degree('Q00535')}",
f"Betweenness Centrality of Q00535: {betCent['Q00535']}")
print(f"Degree of Q92831: {PPI_graph.degree('Q92831')}",
f"Betweenness Centrality of Q92831: {betCent['Q92831']}")
print("target nodes")
print(f"Degree of Q9BY41: {PPI_graph.degree('Q9BY41')}",
f"Betweenness Centrality of Q9BY41: {betCent['Q9BY41']}")
print(f"Degree of Q8IZL8: {PPI_graph.degree('Q8IZL8')}",
f"Betweenness Centrality of Q8IZL8: {betCent['Q8IZL8']}")
print(f"Degree of P00533: {PPI_graph.degree('P00533')}",
f"Betweenness Centrality of P00533: {betCent['P00533']}")
# let's examine the paths that are most rejected, and see the nature of the source nodes. The majority are 4 in length.
most_reject_paths = [("P05388", "O60216"), ("Q15572", "Q9Y383"), ("Q92833", "P31327"),
("P20933", "P40692"), ("P02765", "Q11203"), ("Q9H840",
"O14979"), ("P02766", "P49619"),
("P02766", "P00734"), ("P00734", "P62753"), ("P40429",
"Q14103"), ("P02768", "Q01850"),
("P02766", "Q9Y3A3"), ("P05387", "Q8NHV4"), ("Q9UBM7",
"P36406"), ("Q9H840", "P49757"),
("Q9UBM7", "Q9H1K0"), ("P05387", "Q9NPI1"), ("P05387",
"P56134"), ("P02765", "O75496"),
("P05388", "Q9NZI8"), ("P20933", "O60925"), ("P39023",
"Q8IXZ2"), ("P02760", "P23396"),
("Q9UBM7", "Q9UKR5"), ("Q15572", "Q9UHJ3")]
# often the longer paths always have a feedback at its beginning
print("\nsource nodes")
print(f"Degree of Q9UBM7: {PPI_graph.degree('Q9UBM7')}",
f"Betweenness Centrality of Q9UBM7: {betCent['Q9UBM7']}")
print(f"Degree of P02766: {PPI_graph.degree('P02766')}",
f"Betweenness Centrality of P02766: {betCent['P02766']}")
print(f"Degree of P05387: {PPI_graph.degree('P05387')}",
f"Betweenness Centrality of P05387: {betCent['P05387']}")
print("target nodes")
print(f"Degree of P62753: {PPI_graph.degree('P62753')}",
f"Betweenness Centrality of P62753: {betCent['P62753']}")
print(f"Degree of O75496: {PPI_graph.degree('O75496')}",
f"Betweenness Centrality of O75496: {betCent['O75496']}")
print(f"Degree of Q9UHJ3: {PPI_graph.degree('Q9UHJ3')}",
f"Betweenness Centrality of Q9UHJ3: {betCent['Q9UHJ3']}")
print("------------------------------Degree Analysis---------------------------------------")
# third point
tail_counter = Counter(df["#tail"])
head_counter = Counter(df["head"])
most_tail_repeated = tail_counter.most_common(1)[0]
most_head_repeated = head_counter.most_common(1)[0]
print(most_tail_repeated, "the most repeated head")
print(most_head_repeated, "the most repeated tail")
export_protein_data(
PPI_graph, most_head_repeated[0], "Exported_files/protein_data.txt")
print("------------------------------Degree Distribution (free-scale property)---------------------------------------")
# fourth point
protein_set = list(PPI_graph.nodes())
draw_degree_histogram(PPI_graph, protein_set,
"Exported_files/degree_report.txt", flag="all")
print("------------------------------Conversion Map---------------------------------------")
# fifth point
uniprot_ids = list(PPI_graph.nodes())[:5]
conversion_map = get_conversion_map(uniprot_ids)
print(f"Sample of conversion map{conversion_map}")
print("debugging efforts")
print(f"Number of UniProt IDs from graph: {len(uniprot_ids)}")
print(f"Number of UniProt IDs in conversion map: {len(conversion_map)}")
missing_data = list()
for protein in uniprot_ids:
if not protein in conversion_map:
missing_data.append(protein)
print(f"Missing data list: {missing_data}")
# this protein does not have a name.
print("the genes name of protein 'Q9Y2S0'")
curr_protein = 'Q9Y2S0'
u = UniProt()
entries = u.get_df(curr_protein)
print(list(entries["Gene Names"])) # the gene name is not known
# sample of Uniport output
print("the genes name of protein 'Q8NI27'")
curr_protein = 'Q8NI27'
u = UniProt()
entries = u.get_df(curr_protein)
print(list(entries["Gene Names"])) # the gene name is known
# sixth point
file_name = "Exported_files/unweighted_graph.csv"
unweighted_graph = saving_as_unweighed_graph_as_adjMatrix(PPI_graph, file_name)
print("adjacency matrix has been generated")