Skip to content

routes

find_compound_outflux_routes(db, compound_id, reaction_gene_expression, expressed_genes=None, drop_nodes=None, max_level=10, max_num_routes=1000, expression_coef=1.0, structure_similarity_coef=10.0, route_length_coef=0.0)

Find how a compound is consumed. Genes are either expressed or unexpressed, so this is a rather binary view of the metabolic network.

When searching for routes, we start from the compound of interest, and expand to other compounds via reactions that are:

  • TODO: reversible, or
  • (physiol_)left_to_right, with the compound of interest on the left.

Whenever we encounter a reaction that's catalyzed by an unexpressed enzyme, we stop the path search. Reactions that are not catalyzed by enzymes are always included in the search.

For details on how the optional parameters are used, see: https://neo4j.com/labs/apoc/4.4/overview/apoc.path/apoc.path.expandConfig/

Parameters:

Name Type Description Default
db Neo4jClient

A Neo4j client connected to the graph database.

required
compound_id str

The metaId of the compound of interest.

required
reaction_gene_expression ReactionGeneMap

Expression levels of the reactions.

required
expressed_genes Iterable[str]

A list of genes that are considered expressed. If not given, all genes are considered expressed.

None
drop_nodes list[str]

The metaId or name of blacklisted Reaction/Compound nodes.

None
max_level int

The maximum number of hops in the traversal.

10
max_num_routes int

The maximum number of routes to return.

1000
expression_coef float

Score coefficient for route expression level.

1.0
structure_similarity_coef float

Score coefficient for metabolite similarity score.

10.0
route_length_coef float

Score coefficient for length of route.

0.0

Returns:

Type Description
list[dict[str, Any]]

Paths to sinks of the given metabolite.

Source code in algo/routes.py
def find_compound_outflux_routes(
    db: Neo4jClient,
    compound_id: str,
    reaction_gene_expression: ReactionGeneMap,
    expressed_genes: Iterable[str] = None,
    drop_nodes: list[str] = None,
    max_level: int = 10,
    max_num_routes: int = 1000,
    expression_coef: float = 1.0,
    structure_similarity_coef: float = 10.0,
    route_length_coef: float = 0.0,
) -> list[dict[str, Any]]:
    """Find how a compound is consumed. Genes are either expressed or unexpressed, so this is a
    rather binary view of the metabolic network.

    When searching for routes, we start from the compound of interest, and
    expand to other compounds via reactions that are:

    * TODO: reversible, or
    * (physiol_)left_to_right, with the compound of interest on the left.

    Whenever we encounter a reaction that's catalyzed by an unexpressed enzyme,
    we stop the path search. Reactions that are not catalyzed by enzymes are
    always included in the search.

    For details on how the optional parameters are used, see:
    https://neo4j.com/labs/apoc/4.4/overview/apoc.path/apoc.path.expandConfig/

    Args:
        db: A Neo4j client connected to the graph database.
        compound_id: The metaId of the compound of interest.
        reaction_gene_expression: Expression levels of the reactions.
        expressed_genes: A list of genes that are considered expressed. If not
            given, all genes are considered expressed.
        drop_nodes: The metaId or name of blacklisted Reaction/Compound nodes.
        max_level: The maximum number of hops in the traversal.
        max_num_routes: The maximum number of routes to return.
        expression_coef: Score coefficient for route expression level.
        structure_similarity_coef: Score coefficient for metabolite similarity
            score.
        route_length_coef: Score coefficient for length of route.

    Returns:
        Paths to sinks of the given metabolite.
    """
    # We want to keep reactions wo/ genes, and reactions w/ expressed genes
    ignore_nodes = get_high_degree_compound_nodes(db)

    if expressed_genes is not None:
        valid_genes = set(expressed_genes)
        all_genes = get_table_of_gene_products(db)
        bad_rxns = [x["rxn_id"] for x in all_genes if x["gene"] not in valid_genes]
        bad_rev_rxns = [f"rev-{x}" for x in bad_rxns]
        ignore_nodes += bad_rxns
        ignore_nodes += bad_rev_rxns

    if drop_nodes is not None:
        ignore_nodes += drop_nodes

    if compound_id in ignore_nodes:
        ignore_nodes.remove(compound_id)
        logger.warning(
            "This may be slow as the compound is in ignore_nodes. Searching for some ubiquitous chemical?"
        )

    # Get a list of possible routes, where each route is represented by a list
    # of sequential nodes (dicts) of Compound->Reaction->Compound->Reaction...
    routes: list[dict[str, list[dict[str, str]]]] = db.read(
        """
        MATCH (n)
        WHERE (n:Compound OR n:Reaction) AND (
            (n.metaId IN $bad_nodes) OR
            (n.name IN $bad_nodes)
        )
        WITH COLLECT(n) AS ns

        MATCH (c:Compound {metaId: $cpd_id})
        CALL apoc.path.expandConfig(c, {
            beginSequenceAtStart: false,
            sequence: "<hasLeft,Reaction|ReverseReaction,hasRight>,Compound,<hasLeft",
            blacklistNodes: ns,
            uniqueness: "NODE_PATH",
            bfs: true,
            minLevel: 2,
            maxLevel: $max_hops,
            limit: $max_num_routes
        })
        YIELD path
        RETURN [x in nodes(path) | {name: x.name, id: x.metaId, nodeType: labels(x)[0]}] AS route;
        """,
        cpd_id=compound_id,
        bad_nodes=ignore_nodes,
        max_hops=max_level,
        max_num_routes=max_num_routes,
    )
    routes = list(routes)

    # When end metabolites don't have consuming reactions, their producing
    # reaction appears but the Compound nodes are missing in the path.
    terminal_rxns = {}
    for i, p in enumerate(routes):
        if p["route"][-1]["nodeType"] == "Reaction":
            # Neo4j can't deal with int keys
            terminal_rxns[str(i)] = p["route"][-1]["id"]

    terminal_cpds = db.read(
        """
        UNWIND keys($rxn_ids) AS rxn_idx
        MATCH (r:Reaction {metaId: $rxn_ids[rxn_idx]})-[:hasRight]->(c:Compound)
        WHERE NOT c.metaId IN $bad_nodes
        RETURN rxn_idx, {name: c.name, id: c.metaId} AS cpd;
        """,
        rxn_ids=terminal_rxns,
        bad_nodes=ignore_nodes,
    )
    for n in terminal_cpds:
        n["cpd"]["nodeType"] = "Compound"
        routes[int(n["rxn_idx"])]["route"].append(n["cpd"])

    # Another case is when the end metabolite is in `ignore_nodes`.
    # These end reactions are dropped.
    routes = [
        r for r in routes if r["route"][-1]["nodeType"] not in {"Reaction", "ReverseReaction"}
    ]

    # Get genes associated with each route
    rxns_in_route = {
        str(i): {x["id"] for x in r["route"] if x["nodeType"] in {"Reaction", "ReverseReaction"}}
        for i, r in enumerate(routes)
    }
    route_expression_levels = [
        reaction_gene_expression.get_route_expression(v) for v in rxns_in_route.values()
    ]

    # Get start and end metabolite structures of each route
    terminal_cpds = [x["route"][-1]["id"] for x in routes]
    query_cpds = list(set(terminal_cpds).union([compound_id]))
    terminal_cpd_structs = db.read(
        """
        MATCH (c:Compound)
        WHERE c.metaId IN $cpds
        RETURN c{.metaId, .smiles}
        """,
        cpds=query_cpds,
    )
    cpd_structs = {n["c"]["metaId"]: n["c"]["smiles"] for n in terminal_cpd_structs}
    if (structure_similarity_coef > 0.0) and (compound_id in cpd_structs):
        input_cpd_struct = cpd_structs[compound_id]
        route_struct_scores = [
            calc_tanimoto_similarity(input_cpd_struct, cpd_structs[x])
            if x != compound_id
            else None
            for x in terminal_cpds
        ]
    else:
        route_struct_scores = [None] * len(terminal_cpds)

    # Calculate the final score of each route
    route_lengths = [len([x for x in r["route"] if x["nodeType"] == "Compound"]) for r in routes]
    route_scores = []
    for e, s, l in zip(route_expression_levels, route_struct_scores, route_lengths):  # noqa: E741
        score_sign = 1.0 if e > 0 else -1.0
        score = expression_coef * abs(e) + route_length_coef * l
        if s is not None:
            score += structure_similarity_coef * s
        route_scores.append(score_sign * score)

    # Save genes in each route
    route_genes = []
    for r in routes:
        r_genes = set()
        for step in r["route"]:
            if step["nodeType"] == "Reaction":
                if step_genes := reaction_gene_expression.reaction_gp_mapping.get(step["id"]):
                    r_genes |= step_genes
        route_genes.append(r_genes)

    res = [
        {
            "score": s,
            "genes": g,
            "route": r["route"],
            "expression_level": exp,
            "tanimoto_similarity": struct,
        }
        for r, s, g, exp, struct in zip(
            routes,
            route_scores,
            route_genes,
            route_expression_levels,
            route_struct_scores,
        )
    ]
    res = sorted(res, key=lambda x: x["score"], reverse=True)
    return res

get_cpd_view_of_pathway(db, pathway_id, ignore_cpds=COMMON_COMPOUNDS)

Get the view of a pathway with primary compounds linked by reactions.

Parameters:

Name Type Description Default
pathway_id str

The pathway metaId or name.

required
ignore_cpds list[str]

A list of common compound names to ignore.

COMMON_COMPOUNDS
Source code in algo/routes.py
def get_cpd_view_of_pathway(
    db: Neo4jClient,
    pathway_id: str,
    ignore_cpds: list[str] = COMMON_COMPOUNDS,
):
    """Get the view of a pathway with primary compounds linked by reactions.

    Args:
        pathway_id: The pathway metaId or name.
        ignore_cpds: A list of common compound names to ignore.
    """
    return db.read(
        """
        MATCH (:Pathway {name: $pw_id})-[:hasReaction]->(r:Reaction),
            (r)-[lr]->(cr:Compound),
            (r)-[lp]->(cp:Compound)
        WHERE (
            ($pw_id IN lr.isPrimaryReactantInPathway) OR
            (r.reactionDirection = 'reversible')
        ) AND (
            ($pw_id IN lp.isPrimaryProductInPathway) OR
            (r.reactionDirection = 'reversible')
        ) AND (
            id(lr) <> id(lp)
        ) AND (
            NOT cr.name IN $ignore_cpds
        ) AND (
            NOT cp.name IN $ignore_cpds
        )
        RETURN cr, cp,
            apoc.create.vRelationship(cr, r.name, properties(r), cp);
        """,
        pw_id=pathway_id,
        ignore_cpds=ignore_cpds,
    )

get_fba_info_of_pathways(db, pathway_ids)

Retrieve the information of a pathway relevant to flux balance analysis.

Source code in algo/routes.py
def get_fba_info_of_pathways(db: Neo4jClient, pathway_ids: list[str]):
    """Retrieve the information of a pathway relevant to flux balance analysis."""
    q = db.read(
        """
    MATCH (p:Pathway)-[:hasReaction]->(r:Reaction)-[l:hasLeft|hasRight]->(c:Compound)-[:hasCompartment]->(cpt:Compartment)
    WHERE p.metaId IN $pw_ids
    WITH r, l, c, cpt
    OPTIONAL MATCH (r)-[:hasRDF {bioQualifier: 'is'}]->(rdf:RDF)
    RETURN
        r.metaId, r.name, r.synonyms, r.reactionDirection, r.gibbs0,
        rdf.ecCode,
        type(l), l.stoichiometry,
        c.metaId, c.name, c.chemicalFormula, c.gibbs0,
        cpt.metaId;
        """,
        pw_ids=pathway_ids,
    )

    # Use dataframe for easier manipulation
    df = pd.DataFrame(q)
    df.columns = [
        "reaction_id",
        "reaction_name",
        "reaction_synonyms",
        "reaction_direction",
        "reaction_gibbs0",
        "reaction_ec_enzymes",
        "side",
        "stoichiometry",
        "compound_id",
        "compound_name",
        "compound_formula",
        "compound_formation_gibbs0",
        "compound_compartment",
    ]
    all_rxns = df.reaction_id.unique()  # used for getting genes later

    # Fix stoichiometry so that side can be dropped
    df.stoichiometry = np.where(df.side == "hasLeft", -df.stoichiometry, df.stoichiometry)

    # Fix missing values
    df.reaction_id = df.reaction_name
    df.reaction_name = df.apply(
        lambda x: "|".join(x["reaction_synonyms"])
        if isinstance(x["reaction_synonyms"], list)
        else x["reaction_name"],
        axis=1,
    )
    df.fillna(
        {
            "reaction_gibbs0": pd.NA,
            "compound_formation_gibbs0": pd.NA,
            "reaction_ec_enzymes": pd.NA,
        },
        inplace=True,
    )
    df.drop(columns=["side", "reaction_synonyms"], inplace=True)

    # Get gene associations
    rxn_genes = {}
    for rxn in all_rxns:
        if genes := get_genes_of_reaction(db, rxn):
            rxn_genes[rxn] = generate_gene_reaction_rule(genes)

    return df, rxn_genes

get_genes_of_reaction(db, reaction_id)

Given a reaction metaId, return the gene products associated with it in the form of a list of (source, target) nodes. This is because certain reactions are associated with GeneProductSet or GeneProductComplex nodes, which represent OR and AND relationships, respectively.

Note that GeneProductSet could contain nested GeneProductComplex nodes.

Source code in algo/routes.py
def get_genes_of_reaction(
    db: Neo4jClient, reaction_id: str
) -> list[tuple[dict[str, str], dict[str, str]]]:
    """Given a reaction metaId, return the gene products associated with it in the form of a list
    of (source, target) nodes. This is because certain reactions are associated with
    ``GeneProductSet`` or ``GeneProductComplex`` nodes, which represent ``OR`` and ``AND``
    relationships, respectively.

    Note that ``GeneProductSet`` could contain nested ``GeneProductComplex`` nodes.
    """
    res = db.read(
        """
        MATCH (r:Reaction {metaId: $rxn})
        WITH r
        CALL apoc.path.expandConfig(r, {
            labelFilter: "GeneProductSet|GeneProductComplex|/GeneProduct",
            minLevel: 1,
            maxLevel: 3
        })
        YIELD path
        WITH apoc.path.elements(path) AS elements
        UNWIND range(0, size(elements)-2) AS index
        WITH elements, index
        WHERE index % 2 = 0
        RETURN DISTINCT
            {
            label: labels(elements[index])[0],
            metaId: elements[index].metaId,
            name: elements[index].name
            } AS source_node,
            {
            label: labels(elements[index+2])[0],
            metaId: elements[index+2].metaId,
            name: elements[index+2].name
            } AS target_node;
        """,
        rxn=reaction_id,
    )
    return [(x["source_node"], x["target_node"]) for x in res]

get_high_degree_compound_nodes(db, degree=45)

Get all compound nodes with a high degree, i.e. with a lot of edges to Reaction nodes. Including these nodes in many cases would pollute the graph with too many outgoing relationships.

Parameters:

Name Type Description Default
degree int

The degree threshold.

45

Returns:

Type Description
list[str]

A list of mcID of compound nodes.

Source code in algo/routes.py
def get_high_degree_compound_nodes(db: Neo4jClient, degree: int = 45) -> list[str]:
    """Get all compound nodes with a high degree, i.e. with a lot of edges to ``Reaction`` nodes.
    Including these nodes in many cases would pollute the graph with too many outgoing
    relationships.

    Args:
        degree: The degree threshold.

    Returns:
        A list of mcID of compound nodes.
    """
    res = db.read(
        """
        MATCH (c:Compound)<-[:hasLeft|hasRight]-(r:Reaction)
        WITH c.metaId as cpd, COUNT(*) AS cnt
        WHERE cnt >= $degree
        RETURN COLLECT(cpd) AS cpds;
        """,
        degree=degree,
    )
    return res[0]["cpds"]

get_reaction_route_between_compounds(db, c1, c2, only_pathway_reactions=True, ignore_node_metaids=[], num_routes=2, max_hops=10)

The function has two modes: one for following pre-defined pathways, and one for following any chain of reactions between two compounds.

Parameters:

Name Type Description Default
c1 str

The first compound metaId.

required
c2 str

The second compound metaId.

required
only_pathway_reactions bool

If True, only follow reactions in pathways.

True
ignore_node_metaids list[str]

A list of metaIds to ignore.

[]
num_routes int

The number of routes to return.

2
max_hops int

The maximum number of hops to follow. When argument only_pathway_reactions is False, this is doubled to account for the extra hops from reaction nodes to compound nodes.

10

Returns:

Type Description
list[dict[str, Any]]

A list of possible routes.

Source code in algo/routes.py
def get_reaction_route_between_compounds(
    db: Neo4jClient,
    c1: str,
    c2: str,
    only_pathway_reactions: bool = True,
    ignore_node_metaids: list[str] = [],
    num_routes: int = 2,
    max_hops: int = 10,
) -> list[dict[str, Any]]:
    """The function has two modes: one for following pre-defined pathways, and one for following
    any chain of reactions between two compounds.

    Args:
        c1: The first compound metaId.
        c2: The second compound metaId.
        only_pathway_reactions: If True, only follow reactions in pathways.
        ignore_node_metaids: A list of metaIds to ignore.
        num_routes: The number of routes to return.
        max_hops: The maximum number of hops to follow. When argument
            ``only_pathway_reactions`` is False, this is doubled to account
            for the extra hops from reaction nodes to compound nodes.

    Returns:
        A list of possible routes.
    """
    if only_pathway_reactions:
        query = """
        MATCH (n)
        WHERE n.metaId IN $ignore_nodes
        WITH COLLECT(n) AS ns
        MATCH (r:Reaction)-[:hasRight]->(c2:Compound {metaId: $c2})
        WITH c2, ns, COLLECT(r) AS rxns
        MATCH (c1:Compound {metaId: $c1})
        CALL apoc.path.expandConfig(c1, {
            relationshipFilter: "<hasLeft|isPrecedingEvent>|isRelatedEvent>",
            labelFilter: "+Reaction",
            terminatorNodes: rxns,
            blacklistNodes: ns,
            bfs: false,
            limit: $num_routes,
            minLevel: 2,
            maxLevel: $max_hops
        })
        YIELD path
        RETURN c2, path, length(path) AS hops
        ORDER BY hops;
        """
    else:
        query = """
        MATCH (n)
        WHERE n.metaId IN $ignore_nodes
        WITH COLLECT(n) AS ns
        MATCH (c1:Compound {metaId: $c1}),
                (c2:Compound {metaId: $c2})
        CALL apoc.path.expandConfig(c1, {
            relationshipFilter: "<hasLeft,hasRight>",
            labelFilter: "+Reaction|Compound",
            terminatorNodes: [c2],
            blacklistNodes: ns,
            bfs: true,
            limit: $num_routes,
            minLevel: 2,
            maxLevel: $max_hops
        })
        YIELD path
        RETURN c2, path, length(path) AS hops
        ORDER BY hops;
        """
    return db.read(
        query,
        ignore_nodes=ignore_node_metaids,
        c1=c1,
        c2=c2,
        num_routes=num_routes,
        max_hops=max_hops * 2,
    )

get_view_of_pathway(db, pathway_id)

Get the view of a pathway.

Parameters:

Name Type Description Default
pathway_id str

The pathway ID.

required

Returns:

Type Description
tuple[list[dict[str, Any]], list[dict, str, Any]]

The view of the pathway.

Source code in algo/routes.py
def get_view_of_pathway(
    db: Neo4jClient, pathway_id: str
) -> tuple[list[dict[str, Any]], list[dict, str, Any]]:
    """Get the view of a pathway.

    Args:
        pathway_id: The pathway ID.

    Returns:
        The view of the pathway.
    """
    nodes = db.read(
        """
    MATCH (:Pathway {metaId: $pw_id})-[l:hasReaction]->(r:Reaction)
    WITH r
    MATCH (r)-[:hasRDF {bioQualifier: 'is'}]->(rrdf:RDF),
            (r)-[:hasGeneProduct|hasComponent|hasMember*]->(gp:GeneProduct)-[:hasRDF {bioQualifier: 'isEncodedBy'}]->(grdf:RDF)
    WITH r, rrdf.ecCode AS ec,
            COLLECT(gp.name) as symbol, COLLECT(grdf.ncbigene) AS ncbi
    RETURN {
        id: id(r), name: r.name, ec: ec, gibbs: r.gibbs0,
        direction: r.reactionDirection,
        genes: {symbol: symbol, ncbi: ncbi}
    } AS node;
        """,
        pw_id=pathway_id,
    )

    edges = db.read(
        """
    MATCH (:Pathway {metaId: $pw_id})-[l:hasReaction]->(r:Reaction)
    WITH COLLECT(r) AS nodes
    UNWIND nodes AS r1
    UNWIND nodes AS r2
    WITH * WHERE id(r1) < id(r2)
    MATCH p=(r1)-[:isPrecedingEvent]-(r2)
    UNWIND relationships(p) AS edge
    RETURN {
        id: id(edge), source: id(startNode(edge)), target: id(endNode(edge))
    } AS edge;
        """,
        pw_id=pathway_id,
    )

    return [x["node"] for x in nodes], [x["edge"] for x in edges]