# Source code for kgx.utils.rdf_utils

import logging
from typing import List, Union
import rdflib
from rdflib import Namespace, URIRef
from rdflib.namespace import RDF, RDFS, OWL
from prefixcommons.curie_util import expand_uri
from kgx.utils.graph_utils import get_category_via_superclass
from kgx.utils.kgx_utils import get_toolkit, get_curie_lookup_service, make_curie

# Toolkit instance returned by kgx_utils.get_toolkit(); the module-level tables
# in this file query it for mappings and descendants.
toolkit = get_toolkit()
# Mapping table exposed by the toolkit's generator; iterated below.
m = toolkit.generator.mappings
# NOTE(review): `x` is never read or updated in the visible code -- possibly a
# leftover from a lost branch of the loop below; confirm before removing.
x = set()
# Filled by the loop below: key -> toolkit.get_by_mapping(key).
mapping = {}
for key, value in m.items():
    k = expand_uri(key)
    v = toolkit.get_by_mapping(key)
    # Only keys that expand_uri() returns unchanged are recorded.
    # NOTE(review): this stores exactly the keys that were NOT altered by
    # expansion, which looks inverted -- confirm against upstream whether an
    # `else` branch was lost here.
    if k == key:
        mapping[k] = v

# NOTE(review): both namespaces are built from an empty base IRI, so every term
# derived from them is a bare local name. The base IRIs look like they were
# lost; restore them from upstream.
OBAN = Namespace('')
BIOLINK = Namespace('')

# Maps predicate IRIs (lower-cased for the OWL/RDFS entries) to short
# predicate names.
# NOTE(review): the first five keys are empty strings, so they collide and only
# the last entry ('has_subsequence') survives. The IRIs look like they were
# lost; restore them from upstream.
predicate_mapping = {
    '': 'has_phenotype',
    '': 'has_disposition',
    '': 'causes_condition',
    '': 'is_subsequence_of',
    '': 'has_subsequence',
    OWL.sameAs.lower(): 'same_as',
    OWL.equivalentClass.lower(): 'same_as',
    OWL.inverseOf.lower(): 'inverse_of',
    RDFS.subClassOf.lower(): 'subclass_of',
    RDFS.subPropertyOf.lower(): 'subproperty_of',
}

# Add one entry per descendant of 'related to' reported by the toolkit: commas
# are dropped and spaces become underscores, and the key is that name prefixed
# with the BIOLINK namespace.
predicate_mapping.update(
    {
        '{}{}'.format(BIOLINK, n): n
        for n in [
            x.replace(',', '').replace(' ', '_')
            for x in toolkit.descendents('related to')
        ]
    }
)


# TODO: consolidate
# Maps class IRIs to category names.
# NOTE(review): every literal key below is an empty string, so the entries
# collide and only the last one ('genotype') survives. The IRIs look like they
# were lost; restore them from upstream.
category_mapping = {
    # subclasses mapped onto their superclasses:
    "": "sequence_feature",
    "": "sequence_feature",
    "": "sequence_feature",
    "": "sequence_feature",
    "": "sequence_feature",
    "": "transcript",
    "": "sequence_feature",
    "": "transcript",
    "": "transcript",
    "": "gene",
    "": "sequence_variant",
    '': 'phenotypic_feature',
    "": "cell",
    "": "anatomical_entity",
    "": "cell",
    "": "anatomical_projection",
    "": "multi_cellular_organism",
    "": "brain",
    "": "quality",
    "": "cell",
    "": "organism",
    "": "cell",
    "": "disease",
    "": "assay",
    "": "process",
    "": "age",
    "": "brain",
    "": "bony_projection",
    "": "extended_life_span",
    "": "life_span_variant",
    "": "shortened_life_span",
    "": "molecular_entity",
    "": "drug",
    "": "chemical_role",
    "": "phenotypic_feature",
    "": "biological_process",
    "": "cellular_component",
    "": "gene",
    "": "sequence_feature",
    "": "genotype",
}

# Add one entry per descendant of 'named thing' reported by the toolkit. The
# key is the BIOLINK namespace followed by the name with commas removed,
# title-cased and with spaces removed; the value is the unmodified name.
category_mapping.update(
    {
        '{}{}'.format(BIOLINK, n.replace(',', '').title().replace(' ', '')): n
        for n in toolkit.descendents('named thing')
    }
)

# Maps property IRIs (rdflib terms) to property names.
# NOTE(review): all URIRef('') keys are identical, so they collide and only the
# last one ('in_taxon') survives. The IRIs look like they were lost; restore
# them from upstream.
property_mapping = {
    OBAN.association_has_subject: 'subject',
    OBAN.association_has_object: 'object',
    OBAN.association_has_predicate: 'predicate',
    # NOTE(review): the original line carried a dangling `'name'` value with no
    # key. BIOLINK.name is assumed here by analogy with the other
    # BIOLINK.<property> entries -- confirm against upstream.
    BIOLINK.name: 'name',
    RDFS.label: 'name',
    RDF.type: 'type',
    URIRef(''): 'type',
    BIOLINK.description: 'description',
    URIRef(''): 'description',
    URIRef(''): 'description',
    BIOLINK.has_evidence: 'has_evidence',
    URIRef(''): 'has_evidence',
    BIOLINK.synonym: 'synonym',
    URIRef(''): 'synonym',
    OWL.sameAs: 'same_as',
    OWL.equivalentClass: 'same_as',
    BIOLINK.in_taxon: 'in_taxon',
    URIRef(''): 'in_taxon',
}

# Per-property flag keyed by property name. Going by the table's name, True
# marks a property that may hold several values and False a single-valued one.
is_property_multivalued = {
    'subject': False,
    'object': False,
    'edge_label': False,
    'description': False,
    'synonym': True,
    'in_taxon': False,
    'same_as': True,
    'name': False,
    'has_evidence': False,
    'provided_by': True,
    'category': True,
    'publications': True,
    'type': False,
}

def process_iri(iri: Union[str, URIRef]) -> str:
    """
    Resolve an IRI to a pre-defined name, falling back to a CURIE.

    The IRI is cast to a string and compared case-insensitively against the
    keys of ``predicate_mapping``, ``category_mapping`` and
    ``property_mapping``, in that order. The value of the first matching key
    is returned. If no key matches, the IRI is converted with ``make_curie``
    and that result is returned.

    Parameters
    ----------
    iri: Union[str, URIRef]
        IRI to process; can be a string or a rdflib.term.URIRef

    Returns
    -------
    str
        A string corresponding to the IRI

    """
    # Lower-case the IRI once rather than once per candidate key.
    iri_lower = str(iri).lower()
    lookup_tables = (predicate_mapping, category_mapping, property_mapping)
    for table in lookup_tables:
        for key, value in table.items():
            if key.lower() == iri_lower:
                return value
    return make_curie(iri)
# NOTE(review): the namespace is built from an empty base IRI, so OBO.term(...)
# yields bare local names that will not equal full IRIs found in a graph. The
# base IRI looks like it was lost; restore it from upstream.
OBO = Namespace('')

# Maps top-level ontology class terms to category names; infer_category()
# looks up each member of a class's subClassOf closure in this table.
top_level_terms = {
    OBO.term('CL_0000000'): 'cell',
    OBO.term('UBERON_0001062'): 'anatomical_entity',
    OBO.term('PATO_0000001'): 'quality',
    OBO.term('NCBITaxon_131567'): 'organism',
    OBO.term('CLO_0000031'): 'cell_line',
    OBO.term('MONDO_0000001'): 'disease',
    OBO.term('CHEBI_23367'): 'molecular_entity',
    OBO.term('CHEBI_23888'): 'drug',
    OBO.term('UPHENO_0001001'): 'phenotypic_feature',
    OBO.term('GO_0008150'): 'biological_process',
    OBO.term('GO_0009987'): 'cellular_process',
    OBO.term('GO_0005575'): 'cellular_component',
    OBO.term('GO_0003674'): 'molecular_function',
    OBO.term('SO_0000704'): 'gene',
    OBO.term('GENO_0000002'): 'variant_locus',
    OBO.term('GENO_0000536'): 'genotype',
    OBO.term('SO_0000110'): 'sequence_feature',
    OBO.term('ECO_0000000'): 'evidence',
    OBO.term('PW_0000001'): 'pathway',
    OBO.term('IAO_0000310'): 'publication',
    OBO.term('SO_0001483'): 'snv',
    OBO.term('GENO_0000871'): 'haplotype',
    OBO.term('SO_0001024'): 'haplotype',
    OBO.term('SO_0000340'): 'chromosome',
    OBO.term('SO_0000104'): 'protein',
    OBO.term('SO_0001500'): 'phenotypic_marker',
    OBO.term('SO_0000001'): 'region',
    OBO.term('HP_0032223'): 'blood_group',
    OBO.term('HP_0031797'): 'clinical_course',
    OBO.term('HP_0040279'): 'frequency',
    OBO.term('HP_0000118'): 'phenotypic_abnormality',
    OBO.term('HP_0032443'): 'past_medical_history',
    OBO.term('HP_0000005'): 'mode_of_inheritance',
    OBO.term('HP_0012823'): 'clinical_modifier',
}
def infer_category(iri: URIRef, rdfgraph: rdflib.Graph) -> List[str]:
    """
    Infer category for a given iri by traversing rdfgraph.

    The transitive closure of ``iri`` over ``rdfs:subClassOf`` is collected
    from ``rdfgraph``. Every member of that closure that is a key of
    ``top_level_terms`` contributes its category. If none matches, the last
    member of the closure is converted to a CURIE and looked up with
    ``get_category_via_superclass`` on the ontology graph of the CURIE lookup
    service.

    Parameters
    ----------
    iri: rdflib.term.URIRef
        IRI
    rdfgraph: rdflib.Graph
        A graph to traverse

    Returns
    -------
    List[str]
        A list of category corresponding to the given IRI. The list is empty
        when nothing matched and the closure is empty or ends at ``iri``
        itself.

    """
    closure = list(rdfgraph.transitive_objects(iri, URIRef(RDFS.subClassOf)))
    category = [top_level_terms[x] for x in closure if x in top_level_terms]
    if category:
        logging.debug("Inferred category as {} based on transitive closure over 'subClassOf' relation".format(category))
        return category

    # Nothing to fall back on: either the closure is empty, or its last member
    # is the IRI we started from.
    if not closure or closure[-1] == iri:
        return category

    subject_curie = make_curie(closure[-1])
    if '_' in subject_curie:
        # NOTE(review): this keeps only the text after the first '_' that
        # follows the first ':' (e.g. 'OBO:GO_0008150' -> '0008150'), which
        # drops the prefix entirely -- confirm this is the intended "fix".
        fixed_curie = subject_curie.split(':', 1)[1].split('_', 1)[1]
        logging.warning("Malformed CURIE {} will be fixed to {}".format(subject_curie, fixed_curie))
        subject_curie = fixed_curie
    cls = get_curie_lookup_service()
    return get_category_via_superclass(cls.ontology_graph, subject_curie)