Source code for manamodeller.results_analysis

import glob
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as hc 
import pandas as pd
import numpy as np
import networkx as nx
from pyvis import network as net

from .modelling import fullname_equation


[docs]
def get_node_list(gml_file):
	"""get_node_list.

	Parameters
	----------
	gml_file : str
		the path to the gml file

	Returns
	-------
		a list containing nodes ids from the gml file

	"""
	with open(gml_file,'r') as gml_handler:
		parsed_gml = nx.parse_gml(gml_handler)
	return(list(parsed_gml.nodes))



[docs]
def visualize_gml(gml_file,window_size=['1000px','1000px'],notebook=True):
	"""visualize_gml.

	Parameters
	----------
	gml_file : str
		the path to the gml file
	window_size : list
		the list of x and y pixel sizes for the output window
	notebook : boolean
		if true enable the pyvis optimized visualisation for notebooks

	Returns
	-------
		an interactive visualisation window of the graph

	"""
	g = net.Network(window_size[0],window_size[1],notebook=notebook)
	nxg = nx.readwrite.gml.read_gml(gml_file)
	g.from_nx(nxg)
	return g.show(gml_file.replace('.gml','.html'))




[docs]
def dendro_reactions(matrix,title = "No title set"):
	"""dendro_reactions.

	Parameters
	----------
	matrix : pandas dataframe
		a distance matrix
	title : str
		the title of the dendrogram

	Returns
	-------
		a scipy linkage object and show plot the dendrogram

	"""
	plt.figure(figsize=(20, 7))
	plt.title(title)
	# Create dendrogram
	linkage = hc.linkage(matrix, method='ward')
	hc.dendrogram(linkage,labels=list(matrix.index))

	plt.xlabel('Reaction ID')
	plt.ylabel('Euclidean distance')
	plt.show()
	return linkage



[docs]
def extract_reactions_from_clusters(matrix,title,write_files=False,file_prefix='cluster',header=True):
	"""extract_reactions_from_clusters.

	Parameters
	----------
	matrix : pandas dataframe
		a distance matrix
	title : str
		the title of dendrograms plots
	write_files : boolean
		if true, write cluster's reaction files
	file_prefix : str
		prefix to add when saving cluster's reaction file
	header : boolean
		if true, add a header to the cluster's reaction file

	Returns
	-------
		Write a csv or a pickle with for categorized reactions activity

	"""
	#Show dendro
	plt.figure(figsize=(20, 7))
	plt.title(title)
	colors = []
	#populate the colors_vector with black hexa code
	[colors.append("#000000") for x in range(0,matrix.shape[0]*matrix.shape[1])]
	# Create dendrogram
	linkage = hc.linkage(matrix, method='ward')
	dendro = hc.dendrogram(linkage,labels=list(matrix.index),link_color_func=lambda k: colors[k])

	plt.xlabel('Reaction ID')
	plt.ylabel('Euclidean distance')
	plt.show()
	#ask user for the number of clusters:
	nclusters = int(input("Enter the number of clusters to extract"))
	cutree = hc.cut_tree(linkage,n_clusters=nclusters).T
	labels = list(matrix.index)
	print(cutree)
	#for each cluster return reaction ids in a dict
	#init dict
	clusters_dict = {}
	for i in range(0,nclusters):
		#for each cluster, define
		print([labels[x] for x in np.where(cutree == i)[1]])
		clusters_dict["cluster"+str(i)] = [labels[x] for x in np.where(cutree == i)[1]]
	if write_files:
		j=1
		for key in clusters_dict.keys():
			with open(file_prefix+str(j)+".tsv",'w') as w_hdler:
				if header:
					w_hdler.write("Reaction ID \n")
				for elem in clusters_dict[key]:
					w_hdler.write(elem.strip('\t')+'\n')
			j=j+1
	return cutree



[docs]
def generate_annotation_table(cluster_file,model,hgnc_data,DARs_direction,outputFile):
	"""generate_annotation_table.

	Parameters
	----------
	cluster_file : str
		path to the tsv file containg current cluster's DARs
	model : cobra model
		A cobra object, loaded with the cobra library
	hgnc_data : pandas dataframe
		a hgnc database annotation dataframe
	DARs_direction : str
		the path to the tsv file for the molecule associated with current cluster to get DARs' direction
	outputFile : str
		path and name of the outputFile (xlsx file)
	Returns
	-------
		Write an excel file with one reaction per line and corresponding annotations in columns

	"""
	comp = {"c":"Cytoplasm","m":"Mitochondrion","x":"Peroxisome","l":"Lysosome","g":"Golgi appartus",\
	 "e":"Extracellular space","r":"Endoplasic reticulum","n":"Nucleus","i":"Mitochondrial intermembrane space"}
	dars_dir_test = pd.read_csv(DARs_direction,sep='\t',index_col=0,header=None)
	annot_df = pd.read_csv(cluster_file,sep='\t')
	#generate columns
	annot_df["Reaction name"] = np.nan
	annot_df["Direction"] = np.nan
	annot_df["Equation"] = np.nan
	annot_df["Associated genes"] = np.nan
	annot_df["Associated genes"] = annot_df["Associated genes"].astype(object)
	annot_df["Pathway in model"] = np.nan
	annot_df["Localisation"] = np.nan
	annot_df["Associated genes links"] = ""
	annot_df["Associated genes links"] = annot_df["Associated genes links"].astype(str)
	for reaction in annot_df.iloc[:,0]:
		rid = reaction.replace('R_','')
		r = model.reactions.get_by_id(reaction.replace('R_',''))
		annot_df.loc[annot_df.iloc[:,0] == reaction,"Reaction name"] = r.name
		annot_df.loc[annot_df.iloc[:,0] == reaction,"Direction"] = dars_dir_test.loc[reaction,1]
		annot_df.loc[annot_df.iloc[:,0] == reaction,"Equation"] = fullname_equation(r)
		#convert HGNC IDs to Hugo symbols
		list_genes = []
		for elem in r.genes:
			list_genes.append(hgnc_data.loc[hgnc_data["HGNC ID"] == elem.id,'Approved symbol'].to_list()[0])
		if((len(list_genes) == 0) & (len(r.genes) != 0)):
			list_genes = r.genes
		annot_df.at[annot_df.loc[annot_df.iloc[:,0] == reaction,"Associated genes"].index[0],"Associated genes"] = list_genes
		#create genecards url with approved symbols
		list_urls = []
		for gene in list_genes:
			list_urls.append("https://www.genecards.org/cgi-bin/carddisp.pl?gene="+str(gene))
		if(len(list_urls) == 0):
			annot_df.at[annot_df.loc[annot_df.iloc[:,0] == reaction,"Associated genes links"].index[0],"Associated genes links"] = ""
		elif(len(list_urls) == 1):
			annot_df.at[annot_df.loc[annot_df.iloc[:,0] == reaction,"Associated genes links"].index[0],"Associated genes links"] = list_urls[0]
		else:
			for i in range(0,len(list_urls)):
				if(8+i> annot_df.shape[1]): #8 is the default number of columns
					annot_df["Associated genes links"+str(i)] = ""
					annot_df["Associated genes links"+str(i)] = annot_df["Associated genes links"+str(i)].astype(str)
				if(i == 0):
					annot_df.at[annot_df.loc[annot_df.iloc[:,0] == reaction,"Associated genes links"].index[0],"Associated genes links"] \
						= annot_df.at[annot_df.loc[annot_df.iloc[:,0] == reaction,"Associated genes links"].index[0],"Associated genes links"] + \
						list_urls[i]
				else:
					annot_df.at[annot_df.loc[annot_df.iloc[:,0] == reaction,"Associated genes links"+str(i)].index[0],"Associated genes links"+str(i)]\
						  = annot_df.at[annot_df.loc[annot_df.iloc[:,0] == reaction,"Associated genes links"+str(i)].index[0],"Associated genes links"+str(i)] + \
						list_urls[i]
		if "array([]," in r.subsystem: #if no subsystem, leave cell empty
			annot_df.loc[annot_df.iloc[:,0] == reaction,"Pathway in model"] = ""
		else:
			annot_df.loc[annot_df.iloc[:,0] == reaction,"Pathway in model"] = r.subsystem.split("'")[1]
		#get compartment, if more than one compartment annotated as transport
		comps = set()
		compsR = set()
		compsP = set()
		for elem in r.reactants:
			comps.add(elem.compartment)
			compsR.add(elem.compartment)
		for elem in r.products:
			comps.add(elem.compartment)
			compsP.add(elem.compartment)
		
		if(len(comps) == 2):
			for elem in annot_df.loc[annot_df.iloc[:,0] == reaction,"Equation"].to_list()[0].split(' '):
				if elem == '-->' or elem == '<--' or elem == '<=>':
					if(len(compsR)>1 and len(compsP)>1):
						#its a co-transport
						annot_df.loc[annot_df.iloc[:,0] == reaction,"Localisation"] = "CoTransport"+'('+comp[list(compsR)[0]]+\
							'/'+comp[list(compsR)[1]]+elem+comp[list(compsP)[0]]+'/'+comp[list(compsP)[1]]+')'
					elif(len(compsR)>1 and len(compsP)==1):
						annot_df.loc[annot_df.iloc[:,0] == reaction,"Localisation"] = "CoTransport"+'('+comp[list(compsR)[0]]+\
							'/'+comp[list(compsR)[1]]+elem+comp[compsP.pop()]+')'
					elif(len(compsR)==1 and len(compsP)>1):
						annot_df.loc[annot_df.iloc[:,0] == reaction,"Localisation"] = "CoTransport"+'('+comp[compsR.pop()]+\
							elem+comp[list(compsP)[0]]+'/'+comp[list(compsP)[1]]+')'
					else:
						annot_df.loc[annot_df.iloc[:,0] == reaction,"Localisation"] = "Transport"+'('+comp[compsR.pop()]+\
							elem+comp[compsP.pop()]+')'
					break
		else:
			annot_df.loc[annot_df.iloc[:,0] == reaction,"Localisation"] = comp[comps.pop()]
	#Excel writer (jmcnamara on stackoverflow)
	writer = pd.ExcelWriter(outputFile, engine='xlsxwriter')
	# Convert the dataframe to an XlsxWriter Excel object.
	annot_df.to_excel(writer, sheet_name="DAR_enriched_table", index=False)
	# Get the xlsxwriter workbook and worksheet objects.
	workbook  = writer.book
	worksheet = writer.sheets["DAR_enriched_table"]
	# Add a text wrap format.
	text_wrap_format = workbook.add_format({'text_wrap': True})
	# Set columns format.
	worksheet.set_column(0,0,15) #Reaction ID
	worksheet.set_column(1,1,30) #Reaction ID, Reaction name
	worksheet.set_column(2,2,10) #Direction
	worksheet.set_column(3,4,30) #Equation, Associated genes
	worksheet.set_column(5,6,45) #Pathway in model, Localisation
	worksheet.set_column(7, annot_df.shape[1], 60, text_wrap_format) # genes columns
	#add an autofilter
	worksheet.autofilter(0, 0, annot_df.shape[0], annot_df.shape[1] - 1)
	# Close the Pandas Excel writer and output the Excel file.
	writer.save()