import logging
from typing import List, Union
import rdflib
from rdflib import Namespace, URIRef
from rdflib.namespace import RDF, RDFS, OWL
from prefixcommons.curie_util import expand_uri
from kgx.utils.graph_utils import get_category_via_superclass
from kgx.utils.kgx_utils import get_toolkit, get_curie_lookup_service, make_curie
# Load the toolkit once at import time and index its mappings by expanded IRI.
toolkit = get_toolkit()
m = toolkit.generator.mappings

# ``x`` collects the mapping keys that expand_uri() handed back unchanged
# (presumably because their prefix is unknown -- confirm against prefixcommons);
# ``mapping`` stores expanded IRI -> toolkit.get_by_mapping(key) for the rest.
x, mapping = set(), {}
for identifier, _ in m.items():
    expanded = expand_uri(identifier)
    # Looked up for every key, including those that end up in ``x``.
    element = toolkit.get_by_mapping(identifier)
    if expanded != identifier:
        mapping[expanded] = element
    else:
        x.add(identifier)
# Namespaces used to build the IRI lookup tables below.
OBAN = Namespace('http://purl.org/oban/')
BIOLINK = Namespace('http://w3id.org/biolink/vocab/')

# Predicate IRI -> slot name. The table is assembled in a single expression;
# on a key collision a later section overrides the value of an earlier one,
# which is what successive dict.update() calls would do as well.
predicate_mapping = {
    # Hand-picked Relation Ontology predicates.
    'http://purl.obolibrary.org/obo/RO_0002200': 'has_phenotype',
    'http://purl.obolibrary.org/obo/RO_0000091': 'has_disposition',
    'http://purl.obolibrary.org/obo/RO_0003303': 'causes_condition',
    'http://purl.obolibrary.org/obo/RO_0002525': 'is_subsequence_of',
    'http://purl.obolibrary.org/obo/RO_0002524': 'has_subsequence',
    # OWL / RDFS vocabulary, keyed by the lower-cased IRI.
    OWL.sameAs.lower(): 'same_as',
    OWL.equivalentClass.lower(): 'same_as',
    OWL.inverseOf.lower(): 'inverse_of',
    RDFS.subClassOf.lower(): 'subclass_of',
    RDFS.subPropertyOf.lower(): 'subproperty_of',
    # Every descendant of 'related to' reported by the toolkit, keyed by the
    # BIOLINK namespace followed by the name with commas removed and spaces
    # replaced by underscores.
    **{
        '{}{}'.format(BIOLINK, slot): slot
        for slot in (
            name.replace(',', '').replace(' ', '_')
            for name in toolkit.descendents('related to')
        )
    },
    # The module-level ``mapping`` table is applied last.
    **mapping,
}
# TODO: consolidate
# Class IRI -> category name. Assembled in a single expression; on a key
# collision a later section overrides the value of an earlier one, which is
# what successive dict.update() calls would do as well.
category_mapping = {
    # subclasses mapped onto their superclasses:
    'http://purl.obolibrary.org/obo/SO_0000405': 'sequence_feature',
    'http://purl.obolibrary.org/obo/SO_0000001': 'sequence_feature',
    'http://purl.obolibrary.org/obo/SO_0000100': 'sequence_feature',
    'http://purl.obolibrary.org/obo/SO_0000336': 'sequence_feature',
    'http://purl.obolibrary.org/obo/SO_0000340': 'sequence_feature',
    'http://purl.obolibrary.org/obo/SO_0000404': 'transcript',
    'http://purl.obolibrary.org/obo/SO_0000460': 'sequence_feature',
    'http://purl.obolibrary.org/obo/SO_0000651': 'transcript',
    'http://purl.obolibrary.org/obo/SO_0000655': 'transcript',
    'http://purl.obolibrary.org/obo/SO_0001217': 'gene',
    'http://purl.obolibrary.org/obo/GENO_0000002': 'sequence_variant',
    'http://purl.obolibrary.org/obo/UPHENO_0001002': 'phenotypic_feature',
    'http://purl.obolibrary.org/obo/CL_0000000': 'cell',
    'http://purl.obolibrary.org/obo/UBERON_0001062': 'anatomical_entity',
    'http://purl.obolibrary.org/obo/ZFA_0009000': 'cell',
    'http://purl.obolibrary.org/obo/UBERON_0004529': 'anatomical_projection',
    'http://purl.obolibrary.org/obo/UBERON_0000468': 'multi_cellular_organism',
    'http://purl.obolibrary.org/obo/UBERON_0000955': 'brain',
    'http://purl.obolibrary.org/obo/PATO_0000001': 'quality',
    'http://purl.obolibrary.org/obo/GO_0005623': 'cell',
    'http://purl.obolibrary.org/obo/WBbt_0007833': 'organism',
    'http://purl.obolibrary.org/obo/WBbt_0004017': 'cell',
    'http://purl.obolibrary.org/obo/MONDO_0000001': 'disease',
    'http://purl.obolibrary.org/obo/PATO_0000003': 'assay',
    'http://purl.obolibrary.org/obo/PATO_0000006': 'process',
    'http://purl.obolibrary.org/obo/PATO_0000011': 'age',
    'http://purl.obolibrary.org/obo/ZFA_0000008': 'brain',
    'http://purl.obolibrary.org/obo/ZFA_0001637': 'bony_projection',
    'http://purl.obolibrary.org/obo/WBPhenotype_0000061': 'extended_life_span',
    'http://purl.obolibrary.org/obo/WBPhenotype_0000039': 'life_span_variant',
    'http://purl.obolibrary.org/obo/WBPhenotype_0001171': 'shortened_life_span',
    'http://purl.obolibrary.org/obo/CHEBI_23367': 'molecular_entity',
    'http://purl.obolibrary.org/obo/CHEBI_23888': 'drug',
    'http://purl.obolibrary.org/obo/CHEBI_51086': 'chemical_role',
    'http://purl.obolibrary.org/obo/UPHENO_0001001': 'phenotypic_feature',
    'http://purl.obolibrary.org/obo/GO_0008150': 'biological_process',
    'http://purl.obolibrary.org/obo/GO_0005575': 'cellular_component',
    'http://purl.obolibrary.org/obo/SO_0000704': 'gene',
    'http://purl.obolibrary.org/obo/SO_0000110': 'sequence_feature',
    'http://purl.obolibrary.org/obo/GENO_0000536': 'genotype',
    # The module-level ``mapping`` table comes next ...
    **mapping,
    # ... followed by every descendant of 'named thing' reported by the
    # toolkit. The key is the BIOLINK namespace plus the name with commas
    # removed, title-cased, and spaces removed; the value is the raw name.
    **{
        '{}{}'.format(
            BIOLINK, name.replace(',', '').title().replace(' ', '')
        ): name
        for name in toolkit.descendents('named thing')
    },
}
# Property IRI -> property name. Several IRIs may share one target name.
property_mapping = {
    # Parts of an OBAN association.
    OBAN.association_has_subject: 'subject',
    OBAN.association_has_object: 'object',
    OBAN.association_has_predicate: 'predicate',
    # Names / labels.
    BIOLINK.name: 'name',
    RDFS.label: 'name',
    # RDF.type *is* http://www.w3.org/1999/02/22-rdf-syntax-ns#type. The
    # table used to list that IRI a second time as an explicit URIRef, which
    # was a duplicate dict key; the redundant entry has been dropped.
    RDF.type: 'type',
    # Descriptions / definitions.
    BIOLINK.description: 'description',
    URIRef('http://purl.obolibrary.org/obo/IAO_0000115'): 'description',
    URIRef('http://purl.org/dc/elements/1.1/description'): 'description',
    # Evidence.
    BIOLINK.has_evidence: 'has_evidence',
    URIRef('http://purl.obolibrary.org/obo/RO_0002558'): 'has_evidence',
    # Synonyms.
    BIOLINK.synonym: 'synonym',
    URIRef('http://www.geneontology.org/formats/oboInOwl#hasExactSynonym'): 'synonym',
    # Equivalence.
    OWL.sameAs: 'same_as',
    OWL.equivalentClass: 'same_as',
    # Taxon.
    BIOLINK.in_taxon: 'in_taxon',
    URIRef('http://purl.obolibrary.org/obo/RO_0002162'): 'in_taxon',
}
# Property name -> whether the property may hold more than one value.
is_property_multivalued = {
    'subject': False,
    'object': False,
    'edge_label': False,
    # ``property_mapping`` in this module emits 'predicate' (for
    # OBAN.association_has_predicate), so that name needs an entry here too.
    # 'edge_label' is kept so existing lookups keep working.
    'predicate': False,
    'description': False,
    'synonym': True,
    'in_taxon': False,
    'same_as': True,
    'name': False,
    'has_evidence': False,
    'provided_by': True,
    'category': True,
    'publications': True,
    'type': False,
}
def process_iri(iri: Union[str, URIRef]) -> str:
    """
    Map an IRI to a pre-defined name, falling back to a CURIE.

    The IRI is compared case-insensitively against the keys of
    ``predicate_mapping``, ``category_mapping`` and ``property_mapping``,
    in that order. The value stored under the first matching key is
    returned. If no key matches, the result of ``make_curie(iri)`` is
    returned instead.

    Parameters
    ----------
    iri: Union[str, URIRef]
        IRI to process; can be a string or a rdflib.term.URIRef

    Returns
    -------
    str
        The mapped value for the IRI, or the CURIE produced by ``make_curie``

    """
    # Lower-case the IRI once rather than once per table entry.
    needle = iri.lower()

    # The tables store their keys in mixed case (some are lower-cased when the
    # table is built, others are not), so each key is normalised before the
    # comparison instead of doing a direct dict lookup.
    # The loop variable is deliberately not called ``mapping`` so it does not
    # shadow the module-level table of that name.
    for table in (predicate_mapping, category_mapping, property_mapping):
        for key, value in table.items():
            if key.lower() == needle:
                return value

    return make_curie(iri)
OBO = Namespace('http://purl.obolibrary.org/obo/')

# OBO term IRI -> category name. Each key is OBO.term(<local id>); entries
# are inserted in the order listed.
top_level_terms = {
    OBO.term(local_id): category
    for local_id, category in (
        ('CL_0000000', 'cell'),
        ('UBERON_0001062', 'anatomical_entity'),
        ('PATO_0000001', 'quality'),
        ('NCBITaxon_131567', 'organism'),
        ('CLO_0000031', 'cell_line'),
        ('MONDO_0000001', 'disease'),
        ('CHEBI_23367', 'molecular_entity'),
        ('CHEBI_23888', 'drug'),
        ('UPHENO_0001001', 'phenotypic_feature'),
        ('GO_0008150', 'biological_process'),
        ('GO_0009987', 'cellular_process'),
        ('GO_0005575', 'cellular_component'),
        ('GO_0003674', 'molecular_function'),
        ('SO_0000704', 'gene'),
        ('GENO_0000002', 'variant_locus'),
        ('GENO_0000536', 'genotype'),
        ('SO_0000110', 'sequence_feature'),
        ('ECO_0000000', 'evidence'),
        ('PW_0000001', 'pathway'),
        ('IAO_0000310', 'publication'),
        ('SO_0001483', 'snv'),
        ('GENO_0000871', 'haplotype'),
        ('SO_0001024', 'haplotype'),
        ('SO_0000340', 'chromosome'),
        ('SO_0000104', 'protein'),
        ('SO_0001500', 'phenotypic_marker'),
        ('SO_0000001', 'region'),
        ('HP_0032223', 'blood_group'),
        ('HP_0031797', 'clinical_course'),
        ('HP_0040279', 'frequency'),
        ('HP_0000118', 'phenotypic_abnormality'),
        ('HP_0032443', 'past_medical_history'),
        ('HP_0000005', 'mode_of_inheritance'),
        ('HP_0012823', 'clinical_modifier'),
    )
}
def infer_category(iri: URIRef, rdfgraph: rdflib.Graph) -> List[str]:
    """
    Infer category for a given iri by traversing rdfgraph.

    The transitive closure of ``iri`` over ``rdfs:subClassOf`` is collected
    from ``rdfgraph``. Every member of the closure that is a key of
    ``top_level_terms`` contributes its category. If none is found, the last
    member of the closure is converted to a CURIE and handed to
    ``get_category_via_superclass`` together with the ontology graph of the
    CURIE lookup service.

    Parameters
    ----------
    iri: rdflib.term.URIRef
        IRI
    rdfgraph: rdflib.Graph
        A graph to traverse

    Returns
    -------
    List[str]
        A list of category corresponding to the given IRI. The list is empty
        when no top-level term is reached and the closure ends at ``iri``
        itself.

    """
    closure = list(rdfgraph.transitive_objects(iri, URIRef(RDFS.subClassOf)))
    category = [top_level_terms[term] for term in closure if term in top_level_terms]

    if category:
        logging.debug("Inferred category as {} based on transitive closure over 'subClassOf' relation".format(category))
        return category

    # Nothing to fall back on when the closure is empty or ends at the IRI
    # itself. rdflib documents transitive_objects() as yielding the subject
    # first, so the empty case is only a safety net.
    if not closure or closure[-1] == iri:
        return category

    # NOTE(review): the last closure member is presumably meant to be the
    # most distant ancestor reached by the traversal -- confirm.
    subject_curie = make_curie(closure[-1])
    if '_' in subject_curie:
        # NOTE(review): this keeps only the text after the first ':' and then
        # after the first '_' (e.g. 'OBO:SO_0000110' -> '0000110'). Kept
        # unchanged; verify this is the intended repair.
        fixed_curie = subject_curie.split(':', 1)[1].split('_', 1)[1]
        logging.warning("Malformed CURIE {} will be fixed to {}".format(subject_curie, fixed_curie))
        subject_curie = fixed_curie

    cls = get_curie_lookup_service()
    return get_category_via_superclass(cls.ontology_graph, subject_curie)