Source code for ddot.utils

from __future__ import absolute_import, print_function

import sys
import base64
import time
import traceback
import os
import io
from math import ceil
from datetime import datetime

import pandas as pd
import networkx as nx
import numpy as np

import ndex.client as nc
from ndex.networkn import NdexGraph
    
import ddot
import ddot.config

def print_time(*s):
    print(' '.join(map(str, s)), datetime.today())
    sys.stdout.flush()

def to_hiview_url(ndex_url, hiview_server='http://hiview.ucsd.edu'):
    if hiview_server=='test':
        hiview_server = 'http://hiview-test.ucsd.edu'

    ndex_uuid = parse_ndex_uuid(ndex_url)
    ndex_server = parse_ndex_server(ndex_url)
    ndex_server = ndex_server.rstrip('/')
    if ndex_server=='http://test.ndexbio.org' or ndex_server=='http://dev2.ndexbio.org':
        server_type = 'test'
    elif ndex_server=='http://public.ndexbio.org' or ndex_server=='http://ndexbio.org':
        server_type = 'public'
    else:
        raise Exception('Invalid NDEx server URL: %s' % ndex_server)
    
    hiview_url = "%s/%s?type=%s&server=%s" % (hiview_server, ndex_uuid, server_type, ndex_server)
    return hiview_url
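
# Illustrative usage of to_hiview_url (a hedged sketch; the UUID below is hypothetical):
#
#   >>> to_hiview_url('http://public.ndexbio.org/v2/network/01234567-89ab-cdef-0123-456789abcdef')
#   'http://hiview.ucsd.edu/01234567-89ab-cdef-0123-456789abcdef?type=public&server=http://public.ndexbio.org'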

def invert_dict(dic, sort=True, keymap={}, valmap={}):
    """Inverts a dictionary of the form

    key1 : [val1, val2]
    key2 : [val1]

    to a dictionary of the form

    val1 : [key1, key2]
    val2 : [key1]

    Parameters
    ----------
    dic : dict

    Returns
    -------
    dict
    """
    dic_inv = {}
    for k, v_list in dic.items():
        k = keymap.get(k, k)
        for v in v_list:
            v = valmap.get(v, v)
            if v in dic_inv:
                dic_inv[v].append(k)
            else:
                dic_inv[v] = [k]
    if sort:
        for k in dic_inv.keys():
            dic_inv[k].sort()
    return dic_inv

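# Illustrative usage (a hedged sketch; the toy dictionary is hypothetical):
#
#   >>> invert_dict({'key1': ['val1', 'val2'], 'key2': ['val1']})
#   {'val1': ['key1', 'key2'], 'val2': ['key1']}
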
def transform_pos(pos, xmin=-250, xmax=250, ymin=-250, ymax=250):
    """Transforms coordinates to fit a bounding box.

    Parameters
    ----------
    pos : dict
        Dictionary mapping node names to (x,y) coordinates
    xmin : float, optional
        Minimum x-coordinate of the bounding box
    xmax : float, optional
        Maximum x-coordinate of the bounding box
    ymin : float, optional
        Minimum y-coordinate of the bounding box
    ymax : float, optional
        Maximum y-coordinate of the bounding box

    Returns
    -------
    dict
        New dictionary with transformed coordinates
    """
    def make_transform(x_list, lo, hi):
        xmin_old, xmax_old = min(x_list), max(x_list)
        midpoint_old = (xmin_old + xmax_old) / 2.
        midpoint_new = (lo + hi) / 2.
        scale = float(hi - lo) / (xmax_old - xmin_old)
        def transform_x(x):
            return (x - midpoint_old) * scale + midpoint_new
        return transform_x

    # Scale each axis against its own bounds
    transform_x = make_transform([x for x, y in pos.values()], xmin, xmax)
    transform_y = make_transform([y for x, y in pos.values()], ymin, ymax)
    pos = {g : (transform_x(x), transform_y(y)) for g, (x, y) in pos.items()}
    return pos

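# Illustrative usage (a hedged sketch; toy coordinates, default bounding box):
#
#   >>> transform_pos({'a': (0, 0), 'b': (10, 10)})
#   {'a': (-250.0, -250.0), 'b': (250.0, 250.0)}
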
def bubble_layout_nx(G, xmin=-750, xmax=750, ymin=-750, ymax=750, verbose=False):
    """Bubble-tree layout using the Tulip library. The input graph must be
    a tree. The layout is scaled so that it fits exactly within a bounding
    box.

    Grivet, S., Auber, D., Domenger, J. P., & Melancon, G. (2006). Bubble
    tree drawing algorithm. In Computer Vision and Graphics (pp. 633-641).
    Springer Netherlands.

    Parameters
    ----------
    G : networkx.Graph
        Tree
    xmin : float, optional
        Minimum x-coordinate of the bounding box
    xmax : float, optional
        Maximum x-coordinate of the bounding box
    ymin : float, optional
        Minimum y-coordinate of the bounding box
    ymax : float, optional
        Maximum y-coordinate of the bounding box

    Returns
    -------
    dict
        Dictionary mapping nodes to 2D coordinates. pos[node_name] -> (x,y)
    """
    from tulip import tlp

    graph = tlp.newGraph()
    nodes = graph.addNodes(len(G.nodes()))
    nodes_idx = make_index(G.nodes())
    for x, y in G.edges():
        graph.addEdge(nodes[nodes_idx[x]], nodes[nodes_idx[y]])

    # Apply the 'Bubble Tree' graph layout plugin from Tulip
    graph.applyLayoutAlgorithm('Bubble Tree')
    viewLayout = graph.getLayoutProperty("viewLayout")

    pos = {g : (viewLayout[i].x(), viewLayout[i].y())
           for i, g in zip(nodes, G.nodes())}
    pos = transform_pos(pos, xmin=xmin, xmax=xmax, ymin=ymin, ymax=ymax)
    return pos

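# Illustrative usage (a hedged sketch; requires the Tulip Python bindings):
#
#   >>> tree = nx.balanced_tree(2, 3)     # toy tree of 15 nodes
#   >>> pos = bubble_layout_nx(tree)      # pos[node] -> (x, y) inside the box
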
def split_indices(n, k):
    # Split a collection (or the range 0..n-1) into k contiguous chunks
    try:
        tmp = iter(n)
        indices = n
    except TypeError:
        assert isinstance(n, int)
        indices = list(range(n))
    chunk_size = int(ceil(float(len(indices)) / k))
    return [(chunk_size * a, min(chunk_size * (a + 1), len(indices)))
            for a in range(int(ceil(float(len(indices)) / chunk_size)))]

def split_indices_chunk(n, k):
    # Split a collection (or the range 0..n-1) into chunks of size k
    try:
        iter(n)
        n = len(n)
    except TypeError:
        assert isinstance(n, int)
    return [(k * i, min(k * (i + 1), n))
            for i in range(int(ceil(float(n) / k)))]

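# Illustrative usage (a hedged sketch):
#
#   >>> split_indices(10, 3)          # 3 chunks
#   [(0, 4), (4, 8), (8, 10)]
#   >>> split_indices_chunk(10, 4)    # chunks of size 4
#   [(0, 4), (4, 8), (8, 10)]
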
def make_index(it):
    """Create a dictionary mapping elements of an iterable to the index
    position of that element.
    """
    return {b : a for a, b in enumerate(it)}

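# Illustrative usage (a hedged sketch; the gene names are hypothetical):
#
#   >>> make_index(['TP53', 'BRCA1', 'EGFR'])
#   {'TP53': 0, 'BRCA1': 1, 'EGFR': 2}
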
def time_print(*s):
    # Same behavior as print_time() above
    print(' '.join(map(str, s)), datetime.today())
    sys.stdout.flush()

def pivot_square(df, index, columns, values, fill_value=0):
    """Convert a dataframe in long format into a square compact representation.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame in long format where every row represents one gene pair
    index : str
        Column to use as the rows of the square matrix
    columns : str
        Column to use as the columns of the square matrix
    values : str
        Column holding the edge values
    fill_value : optional
        Value used for missing gene pairs. Default: 0

    Returns
    -------
    df : pandas.DataFrame
        DataFrame with gene-by-gene dimensions
    """
    df = df.pivot(index=index, columns=columns, values=values)
    index = df.index.union(df.columns)
    df = df.reindex(index=index, columns=index, copy=False)

    # Make the square matrix symmetric by taking, for each pair (i,j), the
    # smaller of df[i,j] and df[j,i], where NaN is treated as +infinity
    tmp = df.values.copy()
    tmp[np.isnan(tmp)] = np.inf
    tmp = np.minimum(tmp, tmp.T)
    tmp[np.isinf(tmp)] = fill_value
    df.iloc[:,:] = tmp

    return df

def melt_square(df, columns=['Gene1', 'Gene2'], similarity='similarity',
                empty_value=0, upper_triangle=True):
    """Melts a square dataframe into a sparse representation.

    Parameters
    ----------
    df : pandas.DataFrame
        Square-shaped dataframe where df[i,j] is the value of edge (i,j)
    columns : iterable
        Column names for nodes in the output dataframe
    similarity : string
        Column name for the edge value in the output dataframe
    empty_value
        Not yet supported
    upper_triangle : bool
        Only use the values in the upper-right triangle (excluding the
        diagonal) of the input square dataframe

    Returns
    -------
    pandas.DataFrame
        3-column dataframe that provides a sparse representation of the
        edges. Two of the columns indicate the node names, and the third
        column indicates the edge value.
    """
    assert df.shape[0] == df.shape[1]

    if upper_triangle:
        assert np.all(df.index.values == df.columns.values)
        df = df.copy()
        tmp = df.values
        # Mask the lower triangle and the diagonal
        tmp[np.tril_indices(tmp.shape[0], k=0)] = np.nan
        df.iloc[:,:] = tmp

    tmp = df.stack()
    tmp.dropna(inplace=True)
    tmp.index.rename(columns, inplace=True)
    tmp.rename(similarity, inplace=True)
    return tmp.reset_index()

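# Illustrative round trip between pivot_square and melt_square (a hedged
# sketch; toy data):
#
#   >>> long_df = pd.DataFrame({'Gene1': ['a', 'a'], 'Gene2': ['b', 'c'],
#   ...                         'similarity': [1.0, 2.0]})
#   >>> sq = pivot_square(long_df, 'Gene1', 'Gene2', 'similarity')
#   >>> sq.loc['b', 'a']   # symmetrized -> 1.0; missing pair sq.loc['b','c'] -> 0.0
#   >>> melt_square(sq)    # -> rows (a,b,1.0), (a,c,2.0), (b,c,0.0)
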
# def get_gene_name_converter(genes, scopes='symbol', fields='entrezgene',
#                             species='human', target='gene'):
#     """Query mygene.info to get a dictionary mapping gene names in the ID
#     namespace <scopes> to the ID namespace in <fields>.
#
#     Weird behavior with mygene.info: for Entrez genes, use fields
#     'entrezgene'. For ENSEMBL genes, use fields "ensembl".
#     """
#     if hasattr(genes, '__iter__') and not isinstance(genes, (str, unicode)):
#         genes = ','.join(genes)
#
#     import requests
#     r = requests.post('http://mygene.info/v3/query',
#                       data={'q': genes,
#                             'scopes': scopes,
#                             'fields': fields,
#                             'species': species})
#
#     def parse_field(x):
#         if isinstance(x, dict):
#             return [unicode(x[target])]
#         elif isinstance(x, list):
#             return [unicode(y[target]) for y in x]
#         else:
#             return unicode(x)
#
#     dic = {x['query'] : parse_field(x[fields])
#            for x in r.json() if x.has_key(fields)}
#     return dic

def update_nx_with_alignment(G, alignment, term_descriptions=None, use_node_name=True):
    """Add node attributes to a NetworkX graph.

    Parameters
    ----------
    G
        NetworkX object
    alignment
        pandas.DataFrame where the index is the name of terms, and where
        there are 3 columns: 'Term', 'Similarity', 'FDR'
    use_node_name : bool

    term_descriptions : dict

    Returns
    -------
    G
        The same NetworkX object, modified in place
    """
    alignment = alignment.copy()
    alignment['FDR'] = alignment['FDR'].astype(str)
    alignment['Similarity'] = alignment['Similarity'].astype(str)

    for node_idx, node_attr in G.nodes(data=True):
        if use_node_name:
            node_name = node_attr['name']
        else:
            node_name = node_idx

        if node_attr['Gene_or_Term']=='Term' and node_name in alignment.index:
            row = alignment.loc[node_name, :]

            if term_descriptions is not None:
                descr = term_descriptions[row['Term']]
                node_attr['Label'] = '%s\n%s' % (node_name, descr)
                node_attr['Aligned_Term_Description'] = descr

            node_attr['represents'] = row['Term']
            node_attr['Aligned_Term'] = row['Term']
            node_attr['Aligned_Similarity'] = row['Similarity']
            node_attr['Aligned_FDR'] = row['FDR']

    return G

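# Illustrative usage (a hedged sketch; the alignment table and GO term are
# hypothetical, and G must carry 'name' and 'Gene_or_Term' node attributes):
#
#   alignment = pd.DataFrame(
#       {'Term': ['GO:0000001'], 'Similarity': [0.9], 'FDR': [0.01]},
#       index=['S1'])
#   G = update_nx_with_alignment(G, alignment)   # annotates term nodes of G
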
################################################### # NetworkX, NdexGraph, and NDEx format converters # ###################################################
def set_node_attributes_from_pandas(G, node_attr):
    """Modify node attributes according to a pandas.DataFrame.

    Parameters
    ----------
    G : networkx.Graph

    node_attr : pandas.DataFrame
    """
    if node_attr is not None:
        G_nodes = set(G.nodes())
        node_attr = node_attr.loc[[x for x in node_attr.index if x in G_nodes], :]
        for feature_name, feature in node_attr.iteritems():
            for n, v in feature.dropna().iteritems():
                try:
                    # If v is actually a NumPy scalar type, e.g. np.float64
                    # or np.int64, then convert it to a fundamental Python
                    # type, e.g. float or int
                    v = v.item()
                except:
                    pass
                G.node[n][feature_name] = v

def set_edge_attributes_from_pandas(G, edge_attr):
    """Modify edge attributes according to a pandas.DataFrame.

    Parameters
    ----------
    G : networkx.Graph

    edge_attr : pandas.DataFrame
    """
    if edge_attr is not None:
        G_edges = set(G.edges())
        edge_attr = edge_attr.loc[[x for x in edge_attr.index if x in G_edges], :]
        for feature_name, feature in edge_attr.iteritems():
            for (e1, e2), v in feature.dropna().iteritems():
                try:
                    v = v.item()
                except:
                    pass
                G[e1][e2][feature_name] = v
                # G.edge[(e1,e2)][feature_name] = v

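# Illustrative usage (a hedged sketch; toy graph; uses the `G.node` accessor
# available in NetworkX < 2.4, matching the functions above):
#
#   >>> g = nx.Graph([('a', 'b')])
#   >>> set_node_attributes_from_pandas(
#   ...     g, pd.DataFrame({'size': [1.0]}, index=['a']))
#   >>> g.node['a']['size']
#   1.0
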
def nx_nodes_to_pandas(G, attr_list=None):
    """Create pandas.DataFrame of node attributes of a NetworkX graph.

    Parameters
    ----------
    G : networkx.Graph

    attr_list : list, optional
        Names of node attributes. Default: all node attributes

    Returns
    -------
    pandas.DataFrame
        DataFrame where the index is the names of nodes and the columns
        are node attributes.
    """
    if attr_list is None:
        attr_list = list(set([a for d in G.nodes(data=True) for a in d[1].keys()]))
    try:
        # Used for pandas version >= 0.23
        return pd.concat([pd.Series(nx.get_node_attributes(G,a), name=a)
                          for a in attr_list],
                         axis=1, sort=True)
    except:
        return pd.concat([pd.Series(nx.get_node_attributes(G,a), name=a)
                          for a in attr_list],
                         axis=1)

def nx_edges_to_pandas(G, attr_list=None):
    """Create pandas.DataFrame of edge attributes of a NetworkX graph.

    Parameters
    ----------
    G : networkx.Graph

    attr_list : list, optional
        Names of edge attributes. Default: all edge attributes

    Returns
    -------
    pandas.DataFrame
        DataFrame where the index is a MultiIndex with two levels (u,v)
        referring to edges and the columns refer to edge attributes. For
        multi(di)graphs, the MultiIndex has three levels of the form
        (u, v, key).
    """
    if attr_list is None:
        attr_list = list(set([a for d in G.edges(data=True) for a in d[2].keys()]))

    if len(attr_list) > 0:
        df = pd.concat([pd.Series(nx.get_edge_attributes(G,a), name=a)
                        for a in attr_list],
                       axis=1)
    else:
        df = pd.DataFrame(index=pd.MultiIndex.from_tuples(G.edges()))

    if df.index.nlevels==2:
        df.index.rename(['Node1', 'Node2'], inplace=True)
    elif df.index.nlevels==3:
        df.index.rename(['Node1', 'Node2', 'EdgeID'], inplace=True)
    else:
        raise Exception('Invalid number of levels: %s' % df.index.nlevels)

    return df

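# Illustrative usage of the two converters above (a hedged sketch; toy graph):
#
#   >>> g = nx.Graph()
#   >>> g.add_node('a', size=1)
#   >>> g.add_edge('a', 'b', weight=0.5)
#   >>> nx_nodes_to_pandas(g)   # index ['a'] (only nodes with attributes), column 'size'
#   >>> nx_edges_to_pandas(g)   # MultiIndex ('Node1','Node2'), column 'weight'
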
def ig_nodes_to_pandas(G, attr_list=None):
    """Create pandas.DataFrame of node attributes of an igraph.Graph object.

    Parameters
    ----------
    G : igraph.Graph

    attr_list : list, optional
        Names of node attributes. Default: all node attributes

    Returns
    -------
    pandas.DataFrame
        DataFrame where the index is the names of nodes and the columns
        are node attributes.
    """
    if attr_list is None:
        attr_list = G.vertex_attributes()

    # Remove the "name" attribute because it will be the DataFrame's index
    if 'name' in attr_list:
        attr_list.remove('name')

    df = pd.DataFrame(index=G.vs['name'])
    for attr in attr_list:
        df[attr] = G.vs[attr]
    df.dropna(axis=0, how='all', inplace=True)
    return df

def ig_edges_to_pandas(G, attr_list=None):
    """Create pandas.DataFrame of edge attributes of an igraph.Graph object.

    Parameters
    ----------
    G : igraph.Graph

    attr_list : list, optional
        Names of edge attributes. Default: all edge attributes

    Returns
    -------
    pandas.DataFrame
        DataFrame where the index is a MultiIndex with two levels (u,v)
        referring to edges and the columns refer to edge attributes.
    """
    if attr_list is None:
        attr_list = G.edge_attributes()

    edge_list = [(G.vs[e.source]['name'], G.vs[e.target]['name']) for e in G.es]
    df = pd.DataFrame(index=pd.MultiIndex.from_tuples(edge_list))
    df.index.rename(['Node1', 'Node2'], inplace=True)
    for attr in attr_list:
        df[attr] = G.es[attr]
    df.dropna(axis=0, how='all', inplace=True)
    return df

def nx_to_NdexGraph(G_nx, discard_null=True):
    """Converts a NetworkX graph into an NdexGraph object.

    Parameters
    ----------
    G_nx : networkx.Graph

    Returns
    -------
    ndex.networkn.NdexGraph
    """
    G = NdexGraph()
    node_id = 0
    node_dict = {}
    G.max_edge_id = 0
    for node_name, node_attr in G_nx.nodes(data=True):
        if discard_null:
            node_attr = {k:v for k,v in node_attr.items() if not pd.isnull(v)}

        if 'name' in node_attr:
            G.add_node(node_id, **node_attr)
        else:
            G.add_node(node_id, name=node_name, **node_attr)
        node_dict[node_name] = node_id
        node_id += 1
    for s, t, edge_attr in G_nx.edges(data=True):
        if discard_null:
            edge_attr = {k:v for k,v in edge_attr.items() if not pd.isnull(v)}
        G.add_edge(node_dict[s], node_dict[t], G.max_edge_id, edge_attr)
        G.max_edge_id += 1

    if hasattr(G_nx, 'pos'):
        G.pos = {node_dict[a] : b for a, b in G_nx.pos.items()}
        # G.subnetwork_id = 1
        # G.view_id = 1

    return G

def NdexGraph_to_nx(G):
    """Converts an NdexGraph object into a NetworkX graph.

    Parameters
    ----------
    G : ndex.networkn.NdexGraph

    Returns
    -------
    networkx.classes.DiGraph
    """
    return nx.DiGraph(nx.relabel_nodes(G, nx.get_node_attributes(G, 'name'), copy=True))

def parse_ndex_uuid(ndex_url):
    """Extracts the NDEx UUID from a URL.

    Parameters
    ----------
    ndex_url : str
        URL for a network stored on NDEx

    Returns
    -------
    str
        UUID of the network
    """
    if 'v2/network/' in ndex_url:
        return ndex_url.split('v2/network/')[1]
    elif '#/network/' in ndex_url:
        return ndex_url.split('#/network/')[1]
    else:
        raise Exception("Not a valid NDEx URL: %s" % ndex_url)

def parse_ndex_server(ndex_url):
    """Extracts the NDEx server address from a URL."""
    if 'v2/network/' in ndex_url:
        return ndex_url.split('v2/network/')[0]
    elif '#/network/' in ndex_url:
        return ndex_url.split('#/network/')[0]
    else:
        raise Exception("Not a valid NDEx URL: %s" % ndex_url)

    # tmp = ndex_url.split('//')
    # if len(tmp) == 2:
    #     # e.g. 'http://dev2.ndexbio.org/v2/network/8bfa8318-55ed-11e7-a2e2-0660b7976219'
    #     return tmp[0] + '//' + tmp[1].split('v2/network/')[0]
    # elif len(tmp) == 1:
    #     # e.g. 'dev2.ndexbio.org/v2/network/8bfa8318-55ed-11e7-a2e2-0660b7976219'
    #     return tmp[0].split('v2/network/')[0]
    # elif len(tmp) == 0 or len(tmp) > 2:
    #     raise Exception()

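# Illustrative parsing (a hedged sketch; the UUID below is hypothetical):
#
#   >>> url = 'http://public.ndexbio.org/v2/network/01234567-89ab-cdef-0123-456789abcdef'
#   >>> parse_ndex_uuid(url)
#   '01234567-89ab-cdef-0123-456789abcdef'
#   >>> parse_ndex_server(url)
#   'http://public.ndexbio.org/'
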
def create_edgeMatrix(X, X_cols, X_rows, verbose=True, G=None, ndex2=True):
    """Converts a NumPy array into an NdexGraph with a special CX aspect
    called "edge_matrix". The array is serialized using base64 encoding.

    Parameters
    ----------
    X : np.ndarray

    X_cols : list
        Column names

    X_rows : list
        Row names

    Returns
    -------
    ndex.networkn.NdexGraph, or an ndex2 NiceCX network when ndex2=True
    """
    if ndex2:
        import ndex2
        import ndex2.client

        if not isinstance(X, np.ndarray):
            raise Exception('Provided matrix is not of type numpy.ndarray')
        if not isinstance(X_cols, list):
            raise Exception('Provided column header is not in the correct format. Please provide a list of strings')
        if not isinstance(X_rows, list):
            raise Exception('Provided row header is not in the correct format. Please provide a list of strings')

        if not X.flags['C_CONTIGUOUS']:
            X = np.ascontiguousarray(X)

        # Serialize the array in 100MB chunks
        X_bytes = X.tobytes()
        chunk_size = int(1e8)
        serialized_list = [{'v': base64.b64encode(X_bytes[s:e])}
                           for i, (s, e) in enumerate(ddot.split_indices_chunk(len(X_bytes), chunk_size))]
        if verbose:
            print('Broke up serialization into %s chunks' % len(serialized_list))
            print('Size of numpy array (MB):', X.nbytes / 1e6)
            serialize_size = sum([sys.getsizeof(x['v']) for x in serialized_list])
            print('Size of serialization (MB):', serialize_size / 1e6)
            print('Constant factor overhead:', float(serialize_size) / X.nbytes)

        nice_cx_builder = ndex2.NiceCXBuilder()
        # nice_cx_builder.set_name(name)
        nice_cx_builder.add_node(name='Matrix', represents='Matrix')
        nice_cx_builder.add_opaque_aspect('matrix', serialized_list)
        nice_cx_builder.add_opaque_aspect('matrix_cols', [{'v': X_cols}])
        nice_cx_builder.add_opaque_aspect('matrix_rows', [{'v': X_rows}])
        nice_cx_builder.add_opaque_aspect('matrix_dtype', [{'v': X.dtype.name}])
        nice_cx = nice_cx_builder.get_nice_cx()

        return nice_cx
    else:
        if not X.flags['C_CONTIGUOUS']:
            X = np.ascontiguousarray(X)

        # Use base64 encoding of binary to text. More efficient than
        # pickle(*, protocol=0)
        start = time.time()
        serialized = base64.b64encode(X)
        base64_time = time.time() - start

        assert isinstance(X_cols, list)
        assert isinstance(X_rows, list)

        if sys.version_info.major==3:
            start = time.time()
            serialized = serialized.decode('utf-8')
            serialize_decode_time = time.time() - start
            if verbose:
                print('serialize_decode_time (sec):', serialize_decode_time)

        if verbose:
            print('base64 encoding time (sec):', base64_time)
            print('Size of numpy array (MB):', X.nbytes / 1e6)
            print('Size of serialization (MB):', sys.getsizeof(serialized) / 1e6)
            print('Constant factor overhead:', float(sys.getsizeof(serialized)) / X.nbytes)

        if G is None:
            G = NdexGraph()
        G.unclassified_cx.append({'matrix': [{'v': serialized}]})
        G.unclassified_cx.append({'matrix_cols': [{'v': X_cols}]})
        G.unclassified_cx.append({'matrix_rows': [{'v': X_rows}]})
        G.unclassified_cx.append({'matrix_dtype': [{'v': X.dtype.name}]})
        G.add_new_node('Matrix')

        return G

def load_edgeMatrix(ndex_uuid,
                    ndex_server,
                    ndex_user,
                    ndex_pass,
                    ndex=None,
                    json=None,
                    verbose=True):
    """Loads a NumPy array from an NdexGraph with a special CX aspect
    called "edge_matrix".

    Parameters
    ----------
    ndex_uuid : str
        NDEx UUID of ontology
    ndex_server : str
        URL of NDEx server
    ndex_user : str
        NDEx username
    ndex_pass : str
        NDEx password
    json : module
        JSON module with "loads" function. Default: the simplejson
        package (must be installed)

    Returns
    -------
    X : np.ndarray

    X_rows : list
        Row names

    X_cols : list
        Column names
    """
    if json is None:
        import simplejson
        json = simplejson

    if ndex is None:
        ndex = nc.Ndex(ndex_server, ndex_user, ndex_pass)

    start = time.time()
    response = ndex.get_network_as_cx_stream(ndex_uuid)
    response.raw.decode_content = True
    try:
        cx = json.load(response.raw)
    finally:
        response.close()
    if verbose:
        print('NDEx download and CX parse time (sec):', time.time() - start)

    start_loop = time.time()

    for aspect in cx:
        if 'matrix_cols' in aspect:
            cols = aspect.get('matrix_cols')[0].get('v')
        if 'matrix_rows' in aspect:
            rows = aspect.get('matrix_rows')[0].get('v')
        if 'matrix_dtype' in aspect:
            dtype = np.dtype(aspect.get('matrix_dtype')[0].get('v'))

    dim = (len(rows), len(cols))
    if verbose:
        print('Dim:', dim)
        print('Bytes:', dim[0] * dim[1] * np.dtype(dtype).itemsize)

    # Collect and decode the base64-encoded chunks of the matrix
    X_buf = []
    for aspect in cx:
        if 'matrix' in aspect:
            for x in aspect.get('matrix'):
                if sys.version_info.major==3:
                    binary_data = base64.decodebytes(x.get('v').encode('utf-8'))
                else:
                    binary_data = base64.b64decode(x.get('v'))
                del x['v']
                X_buf.append(binary_data)
    X_buf = (b"").join(X_buf)

    # Create a NumPy array
    X = np.frombuffer(X_buf, dtype=dtype).reshape(dim)

    if verbose:
        print('Iterate through CX and construct array time (sec):', time.time() - start_loop)

    return X, rows, cols

def sim_matrix_to_NdexGraph(sim, names, similarity, output_fmt, node_attr=None):
    """Convert a similarity matrix into an NdexGraph object.

    Parameters
    ----------
    sim : np.ndarray
        Square-shaped NumPy array representing similarities
    names : list
        Gene names, in the same order as the rows and columns of sim
    similarity : str
        Edge attribute name for similarities in the resulting NdexGraph object
    output_fmt : str
        Either 'cx' (standard CX format) or 'cx_matrix' (custom edgeMatrix aspect)
    node_attr : pandas.DataFrame, optional
        Node attributes, as a pandas.DataFrame, to be set in the NdexGraph object

    Returns
    -------
    ndex.networkn.NdexGraph
    """
    if output_fmt == 'cx_matrix':
        names = list(names)
        return create_edgeMatrix(sim, names, names, ndex2=False)
    elif output_fmt == 'cx':
        # Keep only the upper-right triangle
        sim[np.tril_indices(sim.shape[0], k=0)] = 0

        # Create NetworkX graph
        nnz = sim.nonzero()
        ebunch = [(a, b, float(c)) for (a, b), c in zip(zip(*nnz), sim[nnz])]
        G = nx.DiGraph()
        G.add_weighted_edges_from(ebunch, weight=similarity)
        nx.relabel_nodes(G, dict(enumerate(names)), copy=False)

        if node_attr is not None:
            set_node_attributes_from_pandas(G, node_attr)

        return nx_to_NdexGraph(G)
    else:
        raise Exception('Unsupported output_fmt: %s' % output_fmt)

def ndex_to_sim_matrix(ndex_url,
                       ndex_server=None,
                       ndex_user=None,
                       ndex_pass=None,
                       similarity=None,
                       input_fmt='cx_matrix',
                       output_fmt='matrix',
                       subset=None,
                       verbose=True):
    """Read a similarity network from NDEx and return it as either a
    square np.ndarray (compact representation) or a pandas.DataFrame of
    the non-zero similarity values (sparse representation).

    Parameters
    ----------
    ndex_url : str
        NDEx URL (or UUID) of ontology
    ndex_server : str
        URL of NDEx server
    ndex_user : str
        NDEx username
    ndex_pass : str
        NDEx password
    similarity : str
        Name of the edge attribute that represents the
        similarity/weight between two nodes. If None, then the edge
        attribute in the output is named 'similarity' and all edges are
        assumed to have a similarity value of 1.
    input_fmt : str

    output_fmt : str
        If 'matrix', return a NumPy array. If 'sparse', return a
        pandas.DataFrame
    subset : optional

    Returns
    -------
    np.ndarray or pandas.DataFrame
    """
    if ndex_server is None:
        ndex_server = ddot.config.ndex_server
    if ndex_user is None:
        ndex_user = ddot.config.ndex_user
    if ndex_pass is None:
        ndex_pass = ddot.config.ndex_pass

    if 'http' in ndex_url:
        ndex_server = parse_ndex_server(ndex_url)
        ndex_uuid = parse_ndex_uuid(ndex_url)
    else:
        ndex_uuid = ndex_url

    if input_fmt=='cx':
        # Read graph using NDEx client
        G = NdexGraph_to_nx(
            NdexGraph(
                server=ndex_server,
                username=ndex_user,
                password=ndex_pass,
                uuid=ndex_uuid))

        # Create a DataFrame of similarity scores
        G_df = nx_edges_to_pandas(G)
        G_df.index.rename(['Node1', 'Node2'], inplace=True)
        G_df.reset_index(inplace=True)
        if similarity is None:
            # Default all edges to a similarity of 1 under the column 'similarity'
            similarity = 'similarity'
            G_df[similarity] = 1.0
        else:
            G_df[similarity] = G_df[similarity].astype(np.float64)

        nodes_attr = nx_nodes_to_pandas(G)

        if output_fmt=='matrix':
            G_sq = pivot_square(G_df, 'Node1', 'Node2', similarity)
            return G_sq.values, G_sq.index.values
        elif output_fmt=='sparse':
            return G_df, nodes_attr
        else:
            raise Exception('Unsupported output_fmt: %s' % output_fmt)

    elif input_fmt=='cx_matrix':
        sim, sim_names, sim_names_col = load_edgeMatrix(
            ndex_uuid,
            ndex_server,
            ndex_user,
            ndex_pass,
            verbose=verbose)
        assert sim_names == sim_names_col
        # Convert to an array so that it can be indexed with an index array below
        sim_names = np.array(sim_names)

        if subset is not None:
            idx = make_index(sim_names)
            idx = np.array([idx[g] for g in subset if g in idx])
            sim = sim[idx, :][:, idx]
            sim_names = sim_names[idx]

        if output_fmt=='matrix':
            return sim, sim_names
        elif output_fmt=='sparse':
            G_sq = pd.DataFrame(sim, index=sim_names, columns=sim_names)
            G_df = melt_square(G_sq)
            return G_df, None
        else:
            raise Exception('Unsupported output_fmt: %s' % output_fmt)
    else:
        raise Exception('Unsupported input_fmt: %s' % input_fmt)

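# Illustrative call (a hedged sketch; the credentials and UUID placeholder
# are hypothetical):
#
#   sim, sim_names = ndex_to_sim_matrix(
#       'http://public.ndexbio.org/v2/network/<uuid-of-a-similarity-network>',
#       ndex_user='myuser', ndex_pass='mypass',
#       input_fmt='cx_matrix', output_fmt='matrix')
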
def expand_seed(seed,
                sim,
                sim_names,
                agg='mean',
                min_sim=-np.inf,
                filter_perc=None,
                seed_perc=None,
                agg_perc=0.5,
                expand_size=None,
                include_seed=True,
                figure=False,
                verbose=False):
    """Identify genes that are most similar to a seed set of genes.

    A gene is included in the expanded set only if it meets all of the
    specified criteria. If include_seed is True, then genes that are
    seeds will be included regardless of the criteria. At the same time,
    the number of genes returned is still limited by expand_size. One
    way to get n novel genes returned is therefore to set expand_size =
    n + |seed| and include_seed = True, and then to remove the seed list
    from expand.

    Parameters
    ----------
    seed : list

    sim : np.ndarray

    sim_names : list of str

    agg : str or function
        Aggregation method. Possible values are mean, min, max, perc.
    min_sim : float
        Minimum similarity to the seed set.
    filter_perc : float
        Filter based on a percentile of similarities between all genes
        and the seed set.
    seed_perc : float
        Filter based on a percentile of similarities between the seed
        set and itself.
    agg_perc : float
        The <agg_perc> percentile of similarities to the seed set. For
        example, if a gene has similarities of (0, 0.2, 0.4, 0.6, 0.8)
        to five seed genes, then the 10% similarity is 0.2.
    expand_size : int
        Maximum limit on the number of returned genes.
    include_seed : bool
        Include the seed genes even if they didn't meet the criteria.
    figure : bool
        Generate a figure showing the distribution of similarities
        within the seed set and between the seed set and the background.

    Returns
    -------
    expand
        The list of expanded genes passing all filters.
    expand_idx
        Indices of the ranking. I.e. `expand_idx[0]` is the index of the
        top gene, so you can get the name of the top gene with
        `sim_names[expand_idx[0]]` where `sim_names` is the input
        parameter.
    sim_2_seed
        The calculated similarities of the genes to the seed set. So
        `sim_2_seed[0]` is the similarity of the gene `sim_names[0]` to
        the seed set.
    fig
        The generated figure. Can be saved like this: plt.savefig('foo.pdf')
    """
    # Check at least one seed
    assert len(seed) > 0

    # Check similarity matrix has correct dimensions
    assert sim.shape[0] == sim.shape[1] and sim.shape[0] == len(sim_names)

    sim_names = np.array(sim_names)
    index = make_index(sim_names)
    seed_idx = np.array([index[g] for g in seed])
    non_seed_idx = np.setdiff1d(np.arange(len(sim_names)), seed_idx)

    # Calculate a similarity score between each gene and the seed set
    # of genes
    sim_slice = sim[seed_idx, :]
    if agg=='mean':
        # Average similarity to the seed set.
        # Don't include self in the average.
        sim_2_seed = sim_slice.sum(0)
        sim_2_seed[seed_idx] -= sim[seed_idx, seed_idx]
        sim_2_seed[seed_idx] /= float(seed_idx.size - 1)
        sim_2_seed[np.setdiff1d(np.arange(sim_2_seed.size), seed_idx)] /= float(seed_idx.size)

        ## Alternative: a simple average that includes self-similarity.
        ## Not used because it is unclear what the similarity to self should be.
        # sim_2_seed = sim_slice.mean(0)
    elif agg=='min':
        # The minimum similarity to any gene in the seed set
        sim_2_seed = sim_slice.min(0)
    elif agg=='max':
        # The maximum similarity to any gene in the seed set
        sim_2_seed = sim_slice.max(0)
    elif agg=='perc':
        # The <agg_perc> percentile of similarities to the seed set.
        # For example, if a gene has similarities of (0, 0.2, 0.4,
        # 0.6, 0.8) to five seed genes, then the 10% similarity is 0.2
        sim_2_seed = np.percentile(sim_slice, 100 * agg_perc, axis=0)
    else:
        raise Exception('Unsupported aggregation method: %s' % agg)

    # Temporarily remove seeds from consideration (they'll be reinserted
    # later), and decrease expand_size accordingly
    if include_seed:
        expand_idx = np.setdiff1d(np.arange(len(sim_names)), seed_idx)
        expand_idx = expand_idx[np.argsort(-1 * sim_2_seed[expand_idx])]
        if expand_size is not None:
            expand_size = max(0, expand_size - seed_idx.size)
    else:
        expand_idx = np.argsort(-1 * sim_2_seed)

    if expand_idx.size > 0:
        # Maximum limit on the number of returned genes
        if expand_size is not None:
            expand_idx = expand_idx[ : expand_size]

        # Filter based on a percentile of similarities between all genes
        # and the seed set
        if filter_perc is not None:
            min_sim = max(min_sim, np.percentile(sim_2_seed, 100 * filter_perc))

        # Filter based on a percentile of similarities between the seed
        # set and itself
        if seed_perc is not None:
            min_sim = max(min_sim, np.percentile(sim_2_seed[seed_idx], 100 * seed_perc))
        if verbose:
            print('min_sim:', min_sim)

        expand_idx = expand_idx[sim_2_seed[expand_idx] >= min_sim]

    expand = np.array(sim_names)[expand_idx]
    expand_sim = sim_2_seed[expand_idx]

    # Include seed genes that didn't meet the criteria
    if include_seed:
        attach_idx = np.setdiff1d(seed_idx, expand_idx)
        if attach_idx.size > 0:
            expand = np.append(expand, sim_names[attach_idx])
            expand_idx = np.append(expand_idx, attach_idx)
            expand_sim = np.append(expand_sim, sim_2_seed[attach_idx])

    if figure:
        import matplotlib.pyplot as plt
        import seaborn as sns
        try:
            fig = plt.figure()
            ax = plt.gca()
            if non_seed_idx.size > 1:
                sns.distplot(sim_2_seed[non_seed_idx], kde=False, norm_hist=True, ax=ax,
                             axlabel='Similarity to seed set', label='Probability Density')
            if seed_idx.size > 1:
                sns.distplot(sim_2_seed[seed_idx], kde=False, norm_hist=True, ax=ax,
                             axlabel='Similarity to seed set', label='Probability Density')
            try:
                fig.savefig(figure)
            except:
                pass
        except:
            fig = None
    else:
        fig = None

    return expand, expand_idx, sim_2_seed, fig

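# Illustrative usage (a hedged sketch; toy similarity matrix, agg='mean'):
#
#   >>> sim = np.array([[1.0, 0.9, 0.1],
#   ...                 [0.9, 1.0, 0.2],
#   ...                 [0.1, 0.2, 1.0]])
#   >>> expand, expand_idx, sim_2_seed, fig = expand_seed(
#   ...     ['g1', 'g2'], sim, ['g1', 'g2', 'g3'])
#   >>> # sim_2_seed -> [0.9, 0.9, 0.15]: each seed's average similarity to
#   >>> # the other seeds; 'g3' averages (0.1 + 0.2) / 2 over the seed set
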
def make_seed_ontology(sim,
                       sim_names,
                       expand_kwargs={},
                       build_kwargs={},
                       align_kwargs={},
                       ndex_kwargs={},
                       node_attr=None,
                       verbose=False,
                       ndex=True):
    """Assembles and analyzes a data-driven ontology to study a process or disease.

    Parameters
    ----------
    sim : np.ndarray
        gene-by-gene similarity array
    sim_names : array-like
        Names of genes as they appear in the rows and columns of <sim>
    expand_kwargs : dict
        Parameters for ddot.expand_seed() to identify an expanded set of genes
    build_kwargs : dict
        Parameters for Ontology.build_from_network() to build a data-driven ontology.
    align_kwargs : dict
        Parameters for Ontology.align() to align against a reference ontology.
    ndex_kwargs : dict
        Parameters for Ontology.to_ndex() to upload the ontology to NDEx.
    node_attr : pd.DataFrame
        A DataFrame of node attributes to assign to the ontology.
    ndex : bool
        If True, then upload the ontology to NDEx using parameters <ndex_kwargs>
    """
    assert 'seed' in expand_kwargs
    seed = expand_kwargs['seed']

    ################
    # Expand genes #
    ################

    if verbose:
        print('----------------')
        print('Expanding genes')
        print('----------------')

    kwargs = {'sim': sim, 'sim_names': sim_names}
    kwargs.update(expand_kwargs)
    expand, expand_idx, sim_2_seed, fig = expand_seed(**kwargs)
    expand_results = {'expand' : expand, 'sim_2_seed' : sim_2_seed, 'fig' : fig}
    expand = list(expand)

    if verbose:
        print('Seed genes:', len(seed))
        print('Expand genes:', len(expand))

    ##################
    # Build Ontology #
    ##################

    if verbose:
        print('-----------------')
        print('Building ontology')
        print('-----------------')

    # Slice the similarity matrix over the expanded gene set and
    # convert to a square dataframe
    df_sq = pd.DataFrame(sim[expand_idx, :][:, expand_idx],
                         index=expand, columns=expand)

    # Convert square to long format
    df = melt_square(df_sq)

    # Build data-driven ontology
    ont = ddot.Ontology.infer_ontology(df, verbose=verbose, **build_kwargs)

    ###############################
    # Align to Reference Ontology #
    ###############################

    if 'hier' in align_kwargs:
        if verbose:
            print('------------------')
            print('Aligning Ontology')
            print('------------------')

        alignment = ont.align(**align_kwargs)

        if verbose:
            print('Alignment: %s alignment matches' % alignment.shape[0])

    #############################
    # Set other node attributes #
    #############################

    # Annotate which genes were part of the seed set
    seed_set = set(seed)
    seed_attr = pd.DataFrame({'Seed' : [g in seed_set for g in ont.genes]},
                             index=ont.genes)
    ont.update_node_attr(seed_attr)

    # Annotate the data similarity to the seed set
    tmp = make_index(sim_names)
    sim_attr = pd.DataFrame({'Similarity_2_Seed' : [sim_2_seed[tmp[g]] for g in ont.genes]},
                            index=ont.genes)
    ont.update_node_attr(sim_attr)

    # Annotate user-specified node attributes
    if node_attr is not None:
        ont.update_node_attr(node_attr)

    # Color seed genes as green (hex #6ACC65)
    fill_attr = pd.DataFrame({'Vis:Fill Color' : '#6ACC65'}, index=seed)
    ont.update_node_attr(fill_attr)

    # Color terms according to the exactness of alignment
    if 'Aligned_Similarity' in ont.node_attr.columns:
        fill_attr = ont.node_attr['Aligned_Similarity'].dropna().map(color_gradient)
        fill_attr = fill_attr.to_frame().rename(columns={'Aligned_Similarity' : 'Vis:Fill Color'})
        ont.update_node_attr(fill_attr)

    ##################
    # Upload to NDEx #
    ##################

    if ndex:
        if verbose:
            print('--------------------------')
            print('Uploading Ontology to NDEx')
            print('--------------------------')

        if 'network' not in ndex_kwargs:
            ndex_kwargs['network'] = df
            ndex_kwargs['features'] = ['similarity']
            ndex_kwargs['main_feature'] = 'similarity'

        description = (
            'Data-driven ontology created by the function ddot.make_seed_ontology() '
            'in the DDOT Python package (https://github.com/michaelkyu/ontology) '
            '(parameters: %s)' % ', '.join(['%s=%s' % (k, v) for k, v in build_kwargs.items()])
        )

        ont_url, ont_ndexgraph = ont.to_ndex(
            description=description,
            verbose=verbose,
            **ndex_kwargs)
    else:
        ont_url, ont_ndexgraph = None, None

    return ont, ont_url, ont_ndexgraph, expand_results

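# Illustrative call (a hedged sketch; the inputs and parameter values are
# hypothetical):
#
#   ont, ont_url, ont_ndexgraph, expand_results = make_seed_ontology(
#       sim, sim_names,
#       expand_kwargs={'seed': ['g1', 'g2'], 'expand_size': 50},
#       ndex=False)
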
def make_network_public(uuid, ndex_server, ndex_user, ndex_pass, timeout=60, error=False):
    # Retry with exponential backoff until the network is public or <timeout> is reached
    ndex = nc.Ndex(ndex_server, ndex_user, ndex_pass)
    sleep_time = 0.25
    start = time.time()
    while True:
        if time.time() - start > timeout:
            print('Failed to make network public: error message:')
            traceback.print_exc()
            if error:
                raise Exception('Could not make the NDEX network %s public' % uuid)
            else:
                break
        else:
            try:
                ndex.make_network_public(uuid)
                break
            except:
                time.sleep(sleep_time)
                sleep_time = min(5, 2 * sleep_time)

def ig_unfold_tree_with_attr(g, sources, mode):
    """Call igraph.Graph.unfold_tree while preserving vertex and edge attributes."""
    g_unfold, g_map = g.unfold_tree(sources, mode=mode)
    g_eids = g.get_eids([(g_map[e.source], g_map[e.target]) for e in g_unfold.es])
    for attr in g.edge_attributes():
        g_unfold.es[attr] = g.es[g_eids][attr]
    for attr in g.vertex_attributes():
        g_unfold.vs[attr] = g.vs[g_map][attr]
    return g_unfold

def gridify(parents, pos, G):
    """Relayout leaf nodes into a grid.

    Nodes must be connected and already laid out in "star"-like
    topologies. In each "star", a set of nodes is positioned to form the
    shape of a circle and connects to a common parent node that is
    positioned at the circle's center. This function repositions the
    nodes in each star into a square grid that inscribes the circle.

    Parameters
    ----------
    parents : list
        For each parent, its children will be arranged in a grid.
    pos : dict
        Dictionary that maps names of nodes to their (x,y) coordinates
    G : nx.Graph
        Network

    Returns
    -------
    None
        Modifies <pos> in place
    """
    for v in parents:
        x_center, y_center = pos[v]
        children = list(G.predecessors(v))
        if len(children) <= 1:
            continue

        # Estimate the radius by averaging the distance to the children
        radius = np.mean([np.sqrt((pos[c][0] - x_center)**2 + (pos[c][1] - y_center)**2)
                          for c in children])

        # Width of the square inscribing the circle is sqrt(2) * radius
        width = np.sqrt(2) * radius

        # Reposition nodes into a square grid
        num_cols = int(np.ceil(np.sqrt(len(children))))
        col_space = width / (num_cols - 1)
        row_space = width / (num_cols - 1)
        topleft = x_center - width / 2, y_center - width / 2
        for i, c in enumerate(children):
            # Integer division for the row index
            row, col = i // num_cols, i % num_cols
            pos[c] = (topleft[0] + col * col_space,
                      topleft[1] + row * row_space)

        # Reposition the center node to be 1/3rd of the distance from
        # the outside of the circle to the center's parent
        p = list(G.successors(v))[0]
        x_parent, y_parent = pos[p]
        distance = np.sqrt((x_parent - x_center)**2 + (y_parent - y_center)**2)
        alpha = (2/3.) * (distance - radius) / distance
        pos[v] = (x_center * (1 - alpha) + x_parent * alpha,
                  y_center * (1 - alpha) + y_parent * alpha)

def nx_set_tree_edges(G, tree_edges):
    # Mark each edge as 'Tree' or 'Not_Tree' in the 'Is_Tree_Edge' attribute
    nx.set_edge_attributes(
        G,
        values={(s, t) : 'Tree' if ((s, t) in tree_edges) else 'Not_Tree'
                for s, t in G.edges(data=False)},
        name='Is_Tree_Edge')

def color_gradient(ratio, min_col='#FFFFFF', max_col='#D65F5F', output_hex=True):
    """Calculate a proportional mix between two colors."""
    min_col_hex = min_col.lstrip('#')
    min_col_rgb = tuple(int(min_col_hex[i:i+2], 16) for i in (0, 2, 4))

    max_col_hex = max_col.lstrip('#')
    max_col_rgb = tuple(int(max_col_hex[i:i+2], 16) for i in (0, 2, 4))

    mix_col_rgb = [int(ratio*x + (1-ratio)*y) for x, y in zip(max_col_rgb, min_col_rgb)]

    if output_hex:
        return ('#%02x%02x%02x' % tuple(mix_col_rgb)).upper()
    else:
        return mix_col_rgb

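# Illustrative usage (a hedged sketch):
#
#   >>> color_gradient(0.0)   # ratio 0 -> the min color
#   '#FFFFFF'
#   >>> color_gradient(0.5)   # halfway between #FFFFFF and #D65F5F
#   '#EAAFAF'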