Source code for gossipcat.graph.GraphFE

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
author:     Ewen Wang
email:      wolfgangwong2012@gmail.com
license:    Apache License 2.0
"""
import pandas as pd
import numpy as np
import networkx as nx
import itertools 

def link_pred_generator(function):
    """
    Decorator that generates a link prediction function.

    The decorated function should take a graph and a list of (source, target) pairs as input, and return a list of predictions for the links. Each prediction should be a probability between 0 and 1.

    The generated link prediction function takes a graph, a source node, and a target node as input, and returns the probability that there is a link between the source and target nodes.

    Args:
        function: The function to decorate.

    Returns:
        A link prediction function.
    """
    def link_pred(graph, source, target):
        for u, v, p in function(graph, [(source, target)]):
            return p
    return link_pred

def hitting_time(nodelist, adj, source, target):
    """
    Calculates the hitting time from a source node to a target node in a directed graph.

    The hitting time is the minimum number of edges that must be traversed to reach the target node from the source node.

    Args:
        nodelist: A list of nodes in the graph.
        adj: The adjacency matrix of the graph.
        source: The index of the source node in the `nodelist`.
        target: The index of the target node in the `nodelist`.

    Returns:
        The hitting time from the source node to the target node, or -1 if the target node is unreachable from the source node.
    """
    hit_ind = (nodelist.index(source), nodelist.index(target))
    A = adj.copy()
    A[hit_ind[1],:] = 0
    A[hit_ind[1], hit_ind[1]] = 1
    A = (A.T/A.sum(axis=1)).T
    B = A.copy()
    prob = 0
    n = 0
    while prob < 0.99 and n < 100:
        prob = B[hit_ind]
        B = np.dot(B, A)
        n += 1
    return n
    
def edge(graph):
    """
    Calculates various network metrics for edges in a graph.

    This function takes a graph as input and returns a Pandas DataFrame containing the following network metrics for each edge:

    * Hitting time: The minimum number of edges that must be traversed to reach the target node from the source node.
    * Shortest path length: The length of the shortest path between the source and target nodes.
    * Efficiency: A measure of how efficiently the source and target nodes are connected.
    * Jaccard coefficient: A measure of the similarity between the neighborhoods of the source and target nodes.
    * Resource allocation index: A measure of how likely it is that the source and target nodes are connected, based on the amount of shared resources they have.
    * Adamic-Adar index: A measure of how likely it is that the source and target nodes are connected, based on their common neighbors.
    * Preferential attachment: A measure of how likely it is that the source and target nodes are connected, based on their degrees.

    Args:
        graph: The graph for which to calculate the network metrics.

    Returns:
        A Pandas DataFrame containing the network metrics for each edge.
    """
    edge_attr = []
    nodelist = list(graph)
    adj = nx.adj_matrix(graph)
    edge_attr = pd.DataFrame(list(itertools.combinations(list(graph.nodes.keys()), r=2)), columns=['source', 'target'])
    edge_attr['hitting_time'] = edge_attr.apply(lambda x: hitting_time(nodelist, adj, x[0], x[1]), axis=1)
    edge_attr['shortest_path_length'] = edge_attr.apply(lambda x: nx.shortest_path_length(graph, x[0], x[1]) if nx.has_path(graph, x[0], x[1]) else 0, axis=1)
    edge_attr['efficiency'] = edge_attr.apply(lambda x: nx.efficiency(graph, x[0], x[1]), axis=1)
    edge_attr['jaccard_coefficient'] = edge_attr.apply(lambda x: link_pred_generator(nx.jaccard_coefficient)(graph, x[0], x[1]), axis=1)
    edge_attr['resource_allocation_index'] = edge_attr.apply(lambda x: link_pred_generator(nx.resource_allocation_index)(graph, x[0], x[1]), axis=1)
    edge_attr['adamic_adar_index'] = edge_attr.apply(lambda x: link_pred_generator(nx.adamic_adar_index)(graph, x[0], x[1]), axis=1)
    edge_attr['preferential_attachment'] = edge_attr.apply(lambda x: link_pred_generator(nx.preferential_attachment)(graph, x[0], x[1]), axis=1)
    return edge_attr


[docs] class Attribute(object): """Generate all node-based, edge-based, and graph-based attributes of all connected components in a whole graph. """ def __init__(self, graph): """Initialize the class and generate graph attributes. Initializes the class by extracting connected components and creating dataframes for graph, node, edge, and pair attributes. Args: graph (networkx.Graph): The input graph. """ self.graphs = list(graph.subgraph(c) for c in nx.connected_components(graph)) self.graph = self.graphs[0] if len(self.graphs)>1: self.all_attr = pd.DataFrame() print("Note: "+str(len(self.graphs))+" connected components are contained.") self.graph_attr = pd.DataFrame() self.node_attr = pd.DataFrame() self.edge_attr = pd.DataFrame() self.pair_attr = pd.DataFrame() def _graph(self): """Generate graph-based attributes. Computes various graph-based attributes including number of nodes, edges, self-loops, density, and centrality measures. Returns: pd.DataFrame: DataFrame containing graph-based attributes. """ self.graph_attr['number_of_nodes'] = [nx.number_of_nodes(self.graph)] self.graph_attr['number_of_edges'] = [nx.number_of_edges(self.graph)] self.graph_attr['number_of_selfloops'] = [nx.number_of_selfloops(self.graph)] self.graph_attr['graph_number_of_cliques'] = [nx.graph_number_of_cliques(self.graph)] self.graph_attr['graph_clique_number'] = [nx.graph_clique_number(self.graph)] self.graph_attr['density'] = [nx.density(self.graph)] self.graph_attr['transitivity'] = [nx.transitivity(self.graph)] self.graph_attr['average_clustering'] = [nx.average_clustering(self.graph)] self.graph_attr['radius'] = [nx.radius(self.graph)] self.graph_attr['is_tree'] = [1 if nx.is_tree(self.graph) else 0] self.graph_attr['wiener_index'] = [nx.wiener_index(self.graph)] return self.graph_attr def _node(self): """Generate node-based attributes. Computes degree centrality, closeness centrality, betweenness centrality, and pagerank for each node. Returns: pd.DataFrame: DataFrame containing node-based attributes. """ degree_cent = pd.DataFrame(list(nx.degree_centrality(self.graph).items()), columns=['node', 'degree_centrality']) closenessCent = pd.DataFrame(list(nx.closeness_centrality(self.graph).items()), columns=['node', 'closeness_centrality']) betweennessCent = pd.DataFrame(list(nx.betweenness_centrality(self.graph).items()), columns=['node', 'betweenness_centrality']) pagerank = pd.DataFrame(list(nx.pagerank(self.graph).items()), columns=['node', 'pagerank']) self.node_attr = degree_cent self.node_attr['closeness_centrality'] = closenessCent['closeness_centrality'] self.node_attr['betweenness_centrality'] = betweennessCent['betweenness_centrality'] self.node_attr['pagerank'] = pagerank['pagerank'] return self.node_attr def _edge(self): """Generate edge-based attributes. Computes edge attributes for the graph. Returns: pd.DataFrame: DataFrame containing edge-based attributes. """ self.edge_attr = edge(self.graph) return self.edge_attr
[docs] def sigTabular(self): """Combine all node-based, edge-based, and graph-based attributes of a single connected component. Merges node and edge attributes and creates a combined DataFrame for a single connected component. Returns: pd.DataFrame: Combined DataFrame of node, edge, and graph-based attributes for a single connected component. """ self.node_attr = self._node(self.graph) self.edge_attr = self._edge(self.graph) self.graph_attr = self._graph(self.graph) self.pair_attr = self.edge_attr.merge(self.node_attr, how='left', left_on='source', right_on='node').merge(self.node_attr, how='left', left_on='target', right_on='node') self.pair_attr = self.pair_attr.drop(['node_x', 'node_y'], axis=1) graph_attr_l = ['number_of_nodes', 'number_of_edges', 'number_of_selfloops', 'graph_number_of_cliques', 'graph_clique_number', 'density', 'transitivity', 'average_clustering', 'radius', 'is_tree', 'wiener_index'] for i in graph_attr_l: self.pair_attr[i] = self.graph_attr[i][0] return self.pair_attr
[docs] def mulTabular(self): """Combine all node-based, edge-based, and graph-based attributes of all connected components in the whole graph. Combines attributes for all connected components in the graph into a single DataFrame. Returns: pd.DataFrame: Combined DataFrame of node, edge, and graph-based attributes for all connected components in the graph. """ for ind, graph in enumerate(self.graphs): self.graph = graph self.pair_attr = self.sigTabular() if ind==0: self.all_attr = self.pair_attr else: self.all_attr = pd.concat([self.all_attr, self.pair_attr]) return self.all_attr
[docs] class GFeature(object): """Feature engineering to add all node-based and graph-based attributes of all connected components in a whole graph. """ def __init__(self, df, source, target): """Initialize the feature engineering class. Initializes the class variables and extracts connected components for the provided graph. Args: df (pd.DataFrame): DataFrame with source and target nodes. source (str): Source node name. target (str): Target node name. """ self.df = df self.source = source self.target = target self.g = nx.from_pandas_edgelist(df=self.df, source=self.source, target=self.target) self.graphs = list(self.g.subgraph(c) for c in nx.connected_components(self.g)) print("Note: "+str(len(self.graphs))+" connected components are contained.") self.graph_attr = pd.DataFrame() self.node_attr = pd.DataFrame() self.df_r = pd.DataFrame() def _graph(self, graph): """Generate graph-based attributes for a given graph. Computes various graph-based attributes for the provided graph, including number of nodes, edges, self-loops, density, and centrality measures. Args: graph (nx.Graph): The input graph. Returns: pd.DataFrame: DataFrame containing graph-based attributes. """ graph_attr = pd.DataFrame() graph_attr['number_of_nodes'] = [nx.number_of_nodes(graph)] graph_attr['number_of_edges'] = [nx.number_of_edges(graph)] graph_attr['number_of_selfloops'] = [nx.number_of_selfloops(graph)] graph_attr['graph_number_of_cliques'] = [nx.graph_number_of_cliques(graph)] graph_attr['graph_clique_number'] = [nx.graph_clique_number(graph)] graph_attr['density'] = [nx.density(graph)] graph_attr['transitivity'] = [nx.transitivity(graph)] graph_attr['average_clustering'] = [nx.average_clustering(graph)] graph_attr['radius'] = [nx.radius(graph)] graph_attr['is_tree'] = [1 if nx.is_tree(graph) else 0] graph_attr['wiener_index'] = [nx.wiener_index(graph)] return graph_attr def _node(self, graph): """Generate node-based attributes for a given graph. Computes degree centrality, closeness centrality, betweenness centrality, and pagerank for each node in the provided graph. Args: graph (nx.Graph): The input graph. Returns: pd.DataFrame: DataFrame containing node-based attributes. """ node_attr = pd.DataFrame() degree_cent = pd.DataFrame(list(nx.degree_centrality(graph).items()), columns=['node', 'degree_centrality']) closenessCent = pd.DataFrame(list(nx.closeness_centrality(graph).items()), columns=['node', 'closeness_centrality']) betweennessCent = pd.DataFrame(list(nx.betweenness_centrality(graph).items()), columns=['node', 'betweenness_centrality']) pagerank = pd.DataFrame(list(nx.pagerank(graph).items()), columns=['node', 'pagerank']) node_attr = degree_cent node_attr['closeness_centrality'] = closenessCent['closeness_centrality'] node_attr['betweenness_centrality'] = betweennessCent['betweenness_centrality'] node_attr['pagerank'] = pagerank['pagerank'] return node_attr
[docs] def signleGraphFeatures(self, graph, df): """Combine all node-based, edge-based, and graph-based attributes of a single connected component. Merges node and graph attributes to create a combined DataFrame for a single connected component. Args: graph (nx.Graph): The input graph representing a single connected component. df (pd.DataFrame): DataFrame with source and target nodes. Returns: pd.DataFrame: Combined DataFrame of node, edge, and graph-based attributes for a single connected component. """ node_attr = self._node(graph) graph_attr = self._graph(graph) df = df.merge(node_attr, how='left', left_on='srcIp', right_on='node') df = df.drop(['node'], axis=1) graph_attr_l = ['number_of_nodes', 'number_of_edges', 'number_of_selfloops', 'graph_number_of_cliques', 'graph_clique_number', 'density', 'transitivity', 'average_clustering', 'radius', 'is_tree', 'wiener_index'] for i in graph_attr_l: node_attr[i] = graph_attr[i][0] df = df.merge(node_attr, how='left', left_on='destIp', right_on='node') df = df.drop(['node'], axis=1) return df
[docs] def graphFeaturesUpdate(self, graph, df, d_r): """ Update graph-based attributes in a given DataFrame. Combines existing graph features with the features of the provided graph and updates the DataFrame. Args: graph (nx.Graph): The input graph representing a single connected component. df (pd.DataFrame): DataFrame with source and target nodes. d_r (pd.DataFrame): DataFrame with existing graph-based attributes. Returns: pd.DataFrame: Updated DataFrame with combined graph features. """ t = self.signleGraphFeatures(graph, df) d_r.update(t, overwrite=False) return d_r
[docs] def generate(self): """Generate graph features for all connected components. Iterates through all connected components in the graph and generates combined graph features. Returns: pd.DataFrame: DataFrame with graph features for all connected components. """ for ind, graph in enumerate(self.graphs): if ind == 0: self.d_r = self.signleGraphFeatures(graph, self.df) else: self.d_r = self.graphFeaturesUpdate(graph, self.df, self.d_r) return self.d_r
# def _graph(graph): # """Generate graph-based attributes.""" # graph_attr = pd.DataFrame() # graph_attr['number_of_nodes'] = [nx.number_of_nodes(graph)] # graph_attr['number_of_edges'] = [nx.number_of_edges(graph)] # graph_attr['number_of_selfloops'] = [nx.number_of_selfloops(graph)] # graph_attr['graph_number_of_cliques'] = [nx.graph_number_of_cliques(graph)] # graph_attr['graph_clique_number'] = [nx.graph_clique_number(graph)] # graph_attr['density'] = [nx.density(graph)] # graph_attr['transitivity'] = [nx.transitivity(graph)] # graph_attr['average_clustering'] = [nx.average_clustering(graph)] # graph_attr['radius'] = [nx.radius(graph)] # graph_attr['is_tree'] = [1 if nx.is_tree(graph) else 0] # graph_attr['wiener_index'] = [nx.wiener_index(graph)] # return graph_attr # def _node(graph): # """Generate node-based attributes.""" # node_attr = pd.DataFrame() # degree_cent = pd.DataFrame(list(nx.degree_centrality(graph).items()), columns=['node', 'degree_centrality']) # closenessCent = pd.DataFrame(list(nx.closeness_centrality(graph).items()), columns=['node', 'closeness_centrality']) # betweennessCent = pd.DataFrame(list(nx.betweenness_centrality(graph).items()), columns=['node', 'betweenness_centrality']) # pagerank = pd.DataFrame(list(nx.pagerank(graph).items()), columns=['node', 'pagerank']) # node_attr = degree_cent # node_attr['closeness_centrality'] = closenessCent['closeness_centrality'] # node_attr['betweenness_centrality'] = betweennessCent['betweenness_centrality'] # node_attr['pagerank'] = pagerank['pagerank'] # return node_attr # def signleGraphFeatures(graph, df): # """Combine all node-based, edge-based, and graph-based attributes of a single connected component.""" # node_attr = _node(graph) # graph_attr = _graph(graph) # df = df.merge(node_attr, how='left', left_on='srcIp', right_on='node') # df = df.drop(['node'], axis=1) # graph_attr_l = ['number_of_nodes', 'number_of_edges', 'number_of_selfloops', 'graph_number_of_cliques', # 'graph_clique_number', 'density', 'transitivity', 'average_clustering', 'radius', 'is_tree', 'wiener_index'] # for i in graph_attr_l: # node_attr[i] = graph_attr[i][0] # df = df.merge(node_attr, how='left', left_on='destIp', right_on='node') # df = df.drop(['node'], axis=1) # return df # def graphFeaturesUpdate(graph, df, d_r): # """Combine all node-based, edge-based, and graph-based attributes of a single connected component.""" # t = signleGraphFeatures(graph, df) # d_r.update(t, overwrite=False) # return d_r # def generate(df, source, target): # """Feature engineering to add all node-based and graph-based attributes of all connected components in a whole graph. # Args: # df: dataframe with source and target nodes. # source: source node name. # target: target node name. # Returns: # A DataFrame with graph features. # """ # g = nx.from_pandas_edgelist(df=df, source=source, target=target) # graphs = list(g.subgraph(c) for c in nx.connected_components(g)) # print("Note: "+str(len(graphs))+" connected components are contained.") # for ind, graph in enumerate(graphs): # if ind == 0: # d_r = signleGraphFeatures(graph, df) # else: # d_r = graphFeaturesUpdate(graph, df, d_r) # return d_r