compounds

Data structures for storing all compounds from the Neo4j graph.

`CompoundMap(db)`

Source code in algo/compounds.py

def __init__(self, db: Neo4jClient):
    self.db = db
    self.id_table, self.cpds = self.get_all_compounds()

`compound_exact_match(query)`

Finds biocyc IDs corresponding to the query.

Source code in algo/compounds.py

def compound_exact_match(self, query: str):
    """Finds biocyc IDs corresponding to the query."""
    res = self.cpds.query("value_lowercase == @query")

    # Return biocyc ID directly if there's an exact match
    if res.shape[0] != 0:
        return list(set(res["biocyc"]))

`get_all_compounds()`

See the BioCyc concepts guide for detailed descriptions.

Returns:

Type	Description
`tuple[pd.DataFrame, pd.DataFrame]`	A tuple of two `pandas.DataFrame`s: One with columns `metaId`, `compartment`, and `biocyc`; the other with columns `biocyc`, `key_id` and `value`.

Source code in algo/compounds.py

def get_all_compounds(self) -> tuple[pd.DataFrame, pd.DataFrame]:
    """See the [BioCyc concepts guide](https://biocyc.org/PGDBConceptsGuide.shtml#TAG:__tex2pag
    e_toc_TAG:__tex2page_sec_4.4) for detailed descriptions.

    Returns:
        A tuple of two `pandas.DataFrame`s:
            One with columns `metaId`, `compartment`, and `biocyc`;
            the other with columns `biocyc`, `key_id` and `value`.
    """
    cpds = self.db.read(
        """
        MATCH (c:Compound)
        OPTIONAL MATCH (c)-[:hasRDF {bioQualifier: 'is'}]->(rdf:RDF)
        OPTIONAL MATCH (c)-[:hasCompartment]->(cpt:Compartment)
        RETURN c{.metaId, .name, .commonName, .synonyms, .smiles},
               rdf{.biocyc, .chebi, .keggCompound, .inchikey,
                   .chemspider, .pubchemCompound, .hmdb, .cas,
                   .metabolights, .lipidmaps, .drugbank, .knapsack,
                   .umbbdCompound, .keggGlycan
               },
               cpt.metaId AS compartmentId, cpt.commonName AS compartment;
        """
    )  # some don't have the RDF/Compartment fields
    res = [None] * len(cpds)
    for i, cpd in enumerate(cpds):
        entry = cpd["c"]
        if cpd["rdf"] is not None:
            entry = {**entry, **cpd["rdf"]}

        # Prefer compartment commonName, fallback to metaId
        # CCO: Cell Component Ontology; CCO-CYTOSOL is the default location
        if cpd["compartmentId"] is not None:
            entry["compartment"] = cpd["compartmentId"]
        if cpd["compartment"] is not None:
            entry["compartment"] = cpd["compartment"]

        res[i] = entry

    # Convert wide to long table for easier queries
    res = pd.DataFrame(res)
    res = res[~res["biocyc"].isna()]  # POLYMER-xxx, not important
    metaid_to_biocyc = res.filter(["metaId", "biocyc", "compartment"])
    res_long = (
        res.drop(columns=["metaId", "compartment"])
        .melt(id_vars=["biocyc"], var_name="key_id")
        .query("value.notna()")
        .explode("value")
        .drop_duplicates()
        .assign(value_lowercase=lambda r: r["value"].str.lower())
    )

    return metaid_to_biocyc, res_long

`search_compound_biocyc_id(query, **kwargs)`

Query for the compound BioCyc ID.

If the query doesn't have an exact full match, performs a fuzzy match and returns the top hits.

Parameters:

Name	Type	Description	Default
`query`	`str`	anything that resembles a BioCyc compound ID.	required
`**kwargs`		passed to difflib.get_close_matches.	`{}`

Returns:

Type	Description
`dict[str, Union[str, list[str], bool]]`	The compound biocyc ID if it exists.

Source code in algo/compounds.py

def search_compound_biocyc_id(
    self, query: str, **kwargs
) -> dict[str, Union[str, list[str], bool]]:
    """Query for the compound BioCyc ID.

    If the query doesn't have an exact full match, performs a fuzzy match and returns the top hits.

    Args:
        query: anything that resembles a BioCyc compound ID.
        **kwargs: passed to [difflib.get_close_matches][].

    Returns:
         The compound biocyc ID if it exists.
    """
    res = {"query": query, "is_fuzzy_match": False}

    # Direct return if query is a biocyc ID
    if query in self.id_table["biocyc"]:
        res["hits"] = [query]
        return res

    # Try exact matches first
    exact_hits = self.compound_exact_match(query)
    if exact_hits:
        res["hits"] = exact_hits
        return res

    # Try fuzzy search if there's no exact biocyc ID matches
    value_matches = get_close_matches(query.lower(), self.cpds["value_lowercase"], **kwargs)
    if value_matches:
        hits = [x for v in value_matches for x in self.compound_exact_match(v)]
        res["hits"] = list(set(hits))
        res["is_fuzzy_match"] = True
        return res

    res["hits"] = []
    return res

`search_compound_compartment(query)`

Given a biocyc ID, return compartments it is in.

Source code in algo/compounds.py

def search_compound_compartment(self, query: str):
    """Given a biocyc ID, return compartments it is in."""
    res = self.id_table.loc[self.id_table.biocyc == query, "compartment"]
    return list(set(res.values))

`search_compound_metaid_in_compartment(query, compartment='cytosol')`

Find the first compound ID in a given compartment.

Source code in algo/compounds.py

def search_compound_metaid_in_compartment(self, query: str, compartment: str = "cytosol"):
    """Find the first compound ID in a given compartment."""
    res = self.id_table.query("biocyc == @query").query("compartment == @compartment").metaId
    return res.tolist()[0]