Source code for kgx.transformers.rdf_transformer
import click, rdflib, logging, os, uuid
import networkx as nx
from typing import Tuple, Union, Set, List, Dict
from rdflib import Namespace, URIRef
from rdflib.namespace import RDF, RDFS, OWL
from collections import defaultdict
from prefixcommons.curie_util import read_remote_jsonld_context
from kgx.prefix_manager import PrefixManager
from kgx.transformers.transformer import Transformer
from kgx.transformers.rdf_graph_mixin import RdfGraphMixin
from kgx.utils.rdf_utils import property_mapping, make_curie, infer_category
from kgx.utils.kgx_utils import get_toolkit
biolink_prefix_map = read_remote_jsonld_context('https://biolink.github.io/biolink-model/context.jsonld')
# TODO: use OBO IRI from biolink model context once https://github.com/biolink/biolink-model/issues/211 is resolved
OBO = Namespace('http://purl.obolibrary.org/obo/')
OBAN = Namespace(biolink_prefix_map['OBAN'])
PMID = Namespace(biolink_prefix_map['PMID'])
BIOLINK = Namespace(biolink_prefix_map['@vocab'])
DEFAULT_EDGE_LABEL = 'related_to'
[docs]class RdfTransformer(RdfGraphMixin, Transformer):
"""
Transformer that parses RDF and loads triples, as nodes and edges, into a networkx.MultiDiGraph
This is the base class which is used to implement other RDF-based transformers.
"""
OWL_PREDICATES = [RDFS.subClassOf, OWL.sameAs, OWL.equivalentClass]
is_about = URIRef('http://purl.obolibrary.org/obo/IAO_0000136')
has_subsequence = URIRef('http://purl.obolibrary.org/obo/RO_0002524')
is_subsequence_of = URIRef('http://purl.obolibrary.org/obo/RO_0002525')
def __init__(self, source_graph: nx.MultiDiGraph = None):
super().__init__(source_graph)
self.ontologies = []
self.prefix_manager = PrefixManager()
self.toolkit = get_toolkit()
[docs] def parse(self, filename: str = None, input_format: str = None, provided_by: str = None, predicates: Set[URIRef] = None) -> None:
"""
Parse a file, containing triples, into a rdflib.Graph
The file can be either a 'turtle' file or any other format supported by rdflib.
Parameters
----------
filename : str
File to read from.
input_format : str
The input file format.
If ``None`` is provided then the format is guessed using ``rdflib.util.guess_format()``
provided_by : str
Define the source providing the input file.
"""
rdfgraph = rdflib.Graph()
if input_format is None:
input_format = rdflib.util.guess_format(filename)
logging.info("Parsing {} with '{}' format".format(filename, input_format))
rdfgraph.parse(filename, format=input_format)
logging.info("{} parsed with {} triples".format(filename, len(rdfgraph)))
# TODO: use source from RDF
if provided_by:
self.graph_metadata['provided_by'] = [provided_by]
else:
if isinstance(filename, str):
self.graph_metadata['provided_by'] = [os.path.basename(filename)]
elif hasattr(filename, 'name'):
self.graph_metadata['provided_by'] = [filename.name]
self.load_networkx_graph(rdfgraph, predicates)
self.load_node_attributes(rdfgraph)
self.report()
[docs] def add_ontology(self, file: str) -> None:
"""
Load an ontology OWL into a Rdflib.Graph
# TODO: is there better way of pre-loading required ontologies?
"""
ont = rdflib.Graph()
logging.info("Parsing {}".format(file))
ont.parse(file, format=rdflib.util.guess_format(file))
self.ontologies.append(ont)
logging.info("{} parsed with {} triples".format(file, len(ont)))
[docs] def load_networkx_graph(self, rdfgraph: rdflib.Graph = None, predicates: Set[URIRef] = None, **kwargs) -> None:
"""
Walk through the rdflib.Graph and load all required triples into networkx.MultiDiGraph
By default this method loads the following predicates,
- ``RDFS.subClassOf``
- ``OWL.sameAs``
- ``OWL.equivalentClass``
- ``is_about`` (IAO:0000136)
- ``has_subsequence`` (RO:0002524)
- ``is_subsequence_of`` (RO:0002525)
This behavior can be overridden by providing a list of rdflib.URIRef that ought to be loaded
via the ``predicates`` parameter.
Parameters
----------
rdfgraph: rdflib.Graph
Graph containing nodes and edges
predicates: list
A list of rdflib.URIRef representing predicates to be loaded
kwargs: dict
Any additional arguments
"""
if predicates is None:
predicates = set()
predicates = predicates.union(self.OWL_PREDICATES, [self.is_about, self.is_subsequence_of, self.has_subsequence])
triples = rdfgraph.triples((None, None, None))
logging.info("Loading from rdflib.Graph to networkx.MultiDiGraph")
with click.progressbar(list(triples), label='Progress') as bar:
for s, p, o in bar:
if (p == self.is_about) and (p in predicates):
logging.debug("Loading is_about predicate")
# if predicate is 'is_about' then treat object as publication
self.add_node_attribute(o, key=s, value='publications')
elif (p == self.is_subsequence_of) and (p in predicates):
logging.debug("Loading is_subsequence_of predicate")
# if predicate is 'is_subsequence_of'
self.add_edge(s, o, self.is_subsequence_of)
elif (p == self.has_subsequence) and (p in predicates):
logging.debug("Loading has_subsequence predicate")
# if predicate is 'has_subsequence', interpret the inverse relation 'is_subsequence_of'
self.add_edge(o, s, self.is_subsequence_of)
elif any(p.lower() == x.lower() for x in predicates):
logging.debug("Loading {} predicate".format(p))
self.add_edge(s, o, p)
[docs] def load_node_attributes(self, rdfgraph: rdflib.Graph) -> None:
"""
This method loads the properties of nodes into networkx.MultiDiGraph
As there can be many values for a single key, all properties are lists by default.
This method assumes that ``RdfTransformer.load_edges()`` has been called, and that all nodes
have had their IRI as an attribute.
Parameters
----------
rdfgraph: rdflib.Graph
Graph containing nodes and edges
"""
logging.info("Loading node attributes from rdflib.Graph into networkx.MultiDiGraph")
with click.progressbar(self.graph.nodes(data=True), label='Progress') as bar:
for n, data in bar:
if 'id' not in data:
data['id'] = n
if 'iri' in data:
uriref = URIRef(data['iri'])
else:
provided_by = self.graph_metadata.get('provided_by')
logging.warning("No 'iri' property for {} provided by {}".format(n, provided_by))
continue
for s, p, o in rdfgraph.triples((uriref, None, None)):
if p in property_mapping:
# predicate corresponds to a property on subject
if not (isinstance(s, rdflib.term.BNode) and isinstance(o, rdflib.term.BNode)):
# neither subject nor object is a BNode
if isinstance(o, rdflib.term.Literal):
o = o.value
self.add_node_attribute(uriref, key=p, value=o)
elif isinstance(o, rdflib.term.Literal):
# object is a Literal
# i.e. predicate corresponds to a property on subject
self.add_node_attribute(uriref, key=p, value=o.value)
categories = infer_category(uriref, rdfgraph)
logging.debug("Inferred '{}' as category for node '{}'".format(categories, uriref))
for category in categories:
self.add_node_attribute(uriref, key='category', value=category)
[docs]class ObanRdfTransformer(RdfTransformer):
"""
Transformer that parses a 'turtle' file and loads triples, as nodes and edges, into a networkx.MultiDiGraph
This Transformer supports OBAN style of modeling where,
- it dereifies OBAN.association triples into a property graph form
- it reifies property graph into OBAN.association triples
"""
[docs] def load_networkx_graph(self, rdfgraph: rdflib.Graph = None, predicates: Set[URIRef] = None, **kwargs) -> None:
"""
Walk through the rdflib.Graph and load all triples into networkx.MultiDiGraph
Parameters
----------
rdfgraph: rdflib.Graph
Graph containing nodes and edges
predicates: list
A list of rdflib.URIRef representing predicates to be loaded
kwargs: dict
Any additional arguments
"""
if not predicates:
predicates = set()
predicates = predicates.union(self.OWL_PREDICATES)
for rel in predicates:
triples = rdfgraph.triples((None, rel, None))
with click.progressbar(list(triples), label="Loading relation '{}'".format(rel)) as bar:
for s, p, o in bar:
if not (isinstance(s, rdflib.term.BNode) and isinstance(o, rdflib.term.BNode)):
self.add_edge(s, o, p)
# get all OBAN.associations
associations = rdfgraph.subjects(RDF.type, OBAN.association)
logging.info("Loading from rdflib.Graph into networkx.MultiDiGraph")
with click.progressbar(list(associations), label='Progress') as bar:
for association in bar:
edge_attr = defaultdict(list)
edge_attr['id'].append(str(association))
# dereify OBAN.association
subject = None
object = None
predicate = None
# get all triples for association
for s, p, o in rdfgraph.triples((association, None, None)):
if o.startswith(PMID):
edge_attr['publications'].append(o)
if p in property_mapping or isinstance(o, rdflib.term.Literal):
p = property_mapping.get(p, p)
if p == 'subject':
subject = o
elif p == 'object':
object = o
elif p == 'predicate':
predicate = o
else:
edge_attr[p].append(o)
if predicate is None:
logging.warning("No 'predicate' for OBAN.association {}; defaulting to '{}'".format(association, self.DEFAULT_EDGE_LABEL))
predicate = DEFAULT_EDGE_LABEL
if subject and object:
self.add_edge(subject, object, predicate)
for key, values in edge_attr.items():
for value in values:
self.add_edge_attribute(subject, object, predicate, key=key, value=value)
[docs] def uriref(self, identifier: str) -> URIRef:
"""
Generate a rdflib.URIRef for a given string.
Parameters
----------
identifier: str
Identifier as string.
Returns
-------
rdflib.URIRef
URIRef form of the input ``identifier``
"""
if identifier in property_mapping:
uri = property_mapping[identifier]
else:
uri = self.prefix_manager.expand(identifier)
return URIRef(uri)
[docs] def save_attribute(self, rdfgraph: rdflib.Graph, object_iri: URIRef, key: str, value: Union[List[str], str]) -> None:
"""
Saves a node or edge attributes from networkx.MultiDiGraph into rdflib.Graph
Intended to be used within `ObanRdfTransformer.save()`.
Parameters
----------
rdfgraph: rdflib.Graph
Graph containing nodes and edges
object_iri: rdflib.URIRef
IRI of an object in the graph
key: str
The name of the attribute
value: Union[List[str], str]
The value of the attribute; Can be either a List or just a string
"""
element = self.toolkit.get_element(key)
if element is None:
return
if element.is_a == 'association slot' or element.is_a == 'node property':
if key in property_mapping:
key = property_mapping[key]
else:
key = URIRef('{}{}'.format(BIOLINK, element.name.replace(' ', '_')))
if not isinstance(value, (list, tuple, set)):
value = [value]
for value in value:
if element.range == 'iri type':
value = URIRef('{}{}'.format(BIOLINK, ''.join(value.title().split(' '))))
rdfgraph.add((object_iri, key, rdflib.term.Literal(value)))
[docs] def save(self, filename: str = None, output_format: str = "turtle", **kwargs) -> None:
"""
Transform networkx.MultiDiGraph into rdflib.Graph that follow OBAN-style reification and export
this graph as a file (``turtle``, by default).
Parameters
----------
filename: str
Filename to write to
output_format: str
The output format; default: ``turtle``
kwargs: dict
Any additional arguments
"""
# Make a new rdflib.Graph() instance to generate RDF triples
rdfgraph = rdflib.Graph()
# Register OBAN URL prefix (http://purl.org/oban/) as `OBAN` in the namespace.
rdfgraph.bind('OBAN', str(OBAN))
# <http://purl.obolibrary.org/obo/RO_0002558> is currently stored as OBO:RO_0002558 rather than RO:0002558
# because of the bug in rdflib. See https://github.com/RDFLib/rdflib/issues/632
rdfgraph.bind('OBO', str(OBO))
rdfgraph.bind('biolink', str(BIOLINK))
# saving all nodes
for n, data in self.graph.nodes(data=True):
if 'iri' not in n:
uriRef = self.uriref(n)
else:
uriRef = URIRef(data['iri'])
for key, value in data.items():
if key not in ['id', 'iri']:
self.save_attribute(rdfgraph, uriRef, key=key, value=value)
# saving all edges
for u, v, data in self.graph.edges(data=True):
if 'relation' not in data:
raise Exception('Relation is a required edge property in the biolink model, edge {} --> {}'.format(u, v))
if 'id' in data and data['id'] is not None:
assoc_id = URIRef(data['id'])
else:
# generating a UUID for association
assoc_id = URIRef('urn:uuid:{}'.format(uuid.uuid4()))
rdfgraph.add((assoc_id, RDF.type, OBAN.association))
rdfgraph.add((assoc_id, OBAN.association_has_subject, self.uriref(u)))
rdfgraph.add((assoc_id, OBAN.association_has_predicate, self.uriref(data['relation'])))
rdfgraph.add((assoc_id, OBAN.association_has_object, self.uriref(v)))
for key, value in data.items():
if key not in ['subject', 'relation', 'object']:
self.save_attribute(rdfgraph, assoc_id, key=key, value=value)
# Serialize the graph into the file.
rdfgraph.serialize(destination=filename, format=output_format)
[docs]class RdfOwlTransformer(RdfTransformer):
"""
Transformer that parses an OWL ontology in RDF, while retaining class-class relationships.
"""
[docs] def load_networkx_graph(self, rdfgraph: rdflib.Graph = None, predicates: Set[URIRef] = None, **kwargs) -> None:
"""
Walk through the rdflib.Graph and load all triples into networkx.MultiDiGraph
Parameters
----------
rdfgraph: rdflib.Graph
Graph containing nodes and edges
predicates: list
A list of rdflib.URIRef representing predicates to be loaded
kwargs: dict
Any additional arguments
"""
triples = rdfgraph.triples((None, RDFS.subClassOf, None))
logging.info("Loading from rdflib.Graph to networkx.MultiDiGraph")
with click.progressbar(list(triples), label='Progress') as bar:
for s, p, o in bar:
# ignoring blank nodes
if isinstance(s, rdflib.term.BNode):
continue
pred = None
parent = None
# TODO: does this block load all relevant bits from an OWL?
if isinstance(o, rdflib.term.BNode):
# C SubClassOf R some D
for x in rdfgraph.objects(o, OWL.onProperty):
pred = x
for x in rdfgraph.objects(o, OWL.someValuesFrom):
parent = x
if pred is None or parent is None:
logging.warning("Do not know how to handle BNode: {}".format(o))
continue
else:
# C SubClassOf D (C and D are named classes)
pred = p
parent = o
self.add_edge(s, parent, pred)
relations = rdfgraph.subjects(RDF.type, OWL.ObjectProperty)
logging.debug("Loading relations")
with click.progressbar(relations, label='Progress') as bar:
for relation in bar:
for _, p, o in rdfgraph.triples((relation, None, None)):
if o.startswith('http://purl.obolibrary.org/obo/RO_'):
self.add_edge(relation, o, p)
else:
self.add_node_attribute(relation, key=p, value=o)
self.add_node_attribute(relation, key='category', value='relation')