# Source code for kgx.transformers.rdf_graph_mixin

import logging
import networkx as nx
from typing import List, Set, Dict, Tuple, Union
import rdflib
from rdflib import URIRef, Namespace

from kgx.utils.graph_utils import curie_lookup
from kgx.utils.rdf_utils import property_mapping, process_iri, make_curie, is_property_multivalued
from kgx.utils.kgx_utils import generate_edge_key
from prefixcommons.curie_util import read_remote_jsonld_context
from kgx.validator import is_curie

# Prefix map read from the Biolink Model JSON-LD context when this module is
# imported. RdfGraphMixin indexes it with the 'OBAN' and 'PMID' keys to build
# its class-level namespaces.
# NOTE(review): the helper name suggests a remote fetch, so importing this
# module presumably requires network access -- confirm.
biolink_prefix_map = read_remote_jsonld_context('https://biolink.github.io/biolink-model/context.jsonld')


class RdfGraphMixin(object):
    """
    A mixin that defines the following methods,

        - load_networkx_graph(): template method that all deriving classes should implement
        - add_node(): method to add a node from a RDF form to property graph form
        - add_node_attribute(): method to add a node attribute from a RDF form to property graph form
        - add_edge(): method to add an edge from a RDF form to property graph form
        - add_edge_attribute(): method to add an edge attribute from an RDF form to property graph form

    """

    # TODO: use OBO IRI from biolink model context once https://github.com/biolink/biolink-model/issues/211 is resolved
    OBO = Namespace('http://purl.obolibrary.org/obo/')
    OBAN = Namespace(biolink_prefix_map['OBAN'])
    PMID = Namespace(biolink_prefix_map['PMID'])
    BIOLINK = Namespace('https://w3id.org/biolink/')
    DEFAULT_EDGE_LABEL = 'related_to'

    def __init__(self, source_graph: nx.MultiDiGraph = None):
        """
        Set up the target graph and an empty metadata dictionary.

        Parameters
        ----------
        source_graph: networkx.MultiDiGraph
            An existing graph to populate. When ``None`` a new, empty
            networkx.MultiDiGraph is created.

        """
        # Compare against None explicitly: an empty networkx graph is falsy
        # (it defines __len__), so a truthiness test would silently discard a
        # caller-supplied empty graph and populate a different one.
        if source_graph is not None:
            self.graph = source_graph
        else:
            self.graph = nx.MultiDiGraph()

        self.graph_metadata = {}

    def load_networkx_graph(self, rdfgraph: rdflib.Graph = None, predicates: Set[URIRef] = None, **kwargs) -> None:
        """
        This method should be overridden and be implemented by the derived class,
        and should load all desired nodes and edges from rdflib.Graph into
        networkx.MultiDiGraph.

        It is preferred that this method does not use the networkx API directly
        when adding nodes, edges, and their attributes. Instead, use the
        following methods,

            - ``add_node()``
            - ``add_node_attribute()``
            - ``add_edge()``
            - ``add_edge_attribute()``

        to ensure that nodes, edges, and their attributes are added in
        conformance with the BioLink Model, and that URIRef's are translated
        into CURIEs or BioLink Model elements whenever appropriate.

        Parameters
        ----------
        rdfgraph: rdflib.Graph
            Graph containing nodes and edges
        predicates: set
            A set of rdflib.URIRef representing predicates to be loaded
        kwargs: dict
            Any additional arguments

        Raises
        ------
        NotImplementedError
            Always; deriving classes must provide the implementation.

        """
        raise NotImplementedError("Method not implemented.")

    def add_node(self, iri: URIRef) -> str:
        """
        This method should be used by all derived classes when adding a node to
        the networkx.MultiDiGraph. This ensures that a node's identifier is a
        CURIE, and that its `iri` property is set.

        A node that is already present in the graph is left untouched.

        Parameters
        ----------
        iri : rdflib.URIRef
            IRI of a node

        Returns
        -------
        str
            The CURIE identifier of a node

        """
        kwargs = {
            'iri': str(iri),
        }
        if 'provided_by' in self.graph_metadata:
            kwargs['provided_by'] = self.graph_metadata['provided_by']

        n = make_curie(iri)
        if n not in self.graph:
            self.graph.add_node(n, **kwargs)

        return n

    def add_edge(self, subject_iri: URIRef, object_iri: URIRef, predicate_iri: URIRef) -> Tuple[str, str, str]:
        """
        This method should be used by all derived classes when adding an edge to
        the networkx.MultiDiGraph. This ensures that the `subject` and `object`
        identifiers are CURIEs, and that `edge_label` is in the correct form.

        The subject and object nodes are created if they do not exist yet. An
        edge that already exists under the same key is left untouched.

        Parameters
        ----------
        subject_iri: rdflib.URIRef
            Subject IRI for the subject in a triple
        object_iri: rdflib.URIRef
            Object IRI for the object in a triple
        predicate_iri: rdflib.URIRef
            Predicate IRI for the predicate in a triple

        Returns
        -------
        Tuple[str, str, str]
            A 3-nary tuple (of the form subject, object, predicate) that
            represents the edge

        """
        s = self.add_node(subject_iri)
        o = self.add_node(object_iri)

        relation = make_curie(predicate_iri)
        edge_label = process_iri(predicate_iri)

        if ' ' in edge_label:
            logging.debug(
                "predicate IRI '{}' yields edge_label '{}' that not in snake_case form; replacing ' ' with '_'".format(predicate_iri, edge_label)
            )
            # Perform the replacement that the message above announces.
            edge_label = edge_label.replace(' ', '_')

        if edge_label.startswith(self.BIOLINK):
            logging.debug(
                "predicate IRI '{}' yields edge_label '{}' that starts with '{}'; removing IRI prefix".format(predicate_iri, edge_label, self.BIOLINK)
            )
            edge_label = edge_label.replace(self.BIOLINK, '')

        if is_curie(edge_label):
            name = curie_lookup(edge_label)
            if name:
                logging.debug(
                    "predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; Using its mapping instead: {}".format(predicate_iri, edge_label, name)
                )
                edge_label = name
            else:
                logging.debug(
                    "predicate IRI '{}' yields edge_label '{}' that is actually a CURIE; defaulting back to {}".format(predicate_iri, edge_label, self.DEFAULT_EDGE_LABEL)
                )
                edge_label = self.DEFAULT_EDGE_LABEL

        kwargs = {
            'subject': s,
            'predicate': predicate_iri,
            'object': o,
            'relation': relation,
            'edge_label': edge_label
        }
        if 'provided_by' in self.graph_metadata:
            kwargs['provided_by'] = self.graph_metadata['provided_by']

        key = generate_edge_key(s, edge_label, o)
        if not self.graph.has_edge(s, o, key=key):
            self.graph.add_edge(s, o, key=key, **kwargs)

        return s, o, edge_label

    def add_node_attribute(self, iri: Union[URIRef, str], key: str, value: str) -> None:
        """
        Add an attribute to a node, while taking into account whether the
        attribute should be multi-valued. Multi-valued properties will not
        contain duplicates.

        The ``key`` may be a rdflib.URIRef or a URI string that maps onto a
        property name as defined in ``rdf_utils.property_mapping``. A key that
        cannot be resolved is ignored and the node is not created.

        If the node does not exist then it is created using the given ``iri``.

        Parameters
        ----------
        iri: Union[rdflib.URIRef, str]
            The IRI of a node in the rdflib.Graph
        key: str
            The name of the attribute. Can be a rdflib.URIRef or URI string
        value: str
            The value of the attribute

        """
        key = self._resolve_attribute_key(key)
        if key is not None:
            n = self.add_node(iri)
            attr_dict = self.graph.nodes[n]
            self._add_attribute(attr_dict, key, value)

    def add_edge_attribute(self, subject_iri: Union[URIRef, str], object_iri: URIRef, predicate_iri: URIRef, key: str, value: str) -> None:
        """
        Adds an attribute to an edge, while taking into account whether the
        attribute should be multi-valued. Multi-valued properties will not
        contain duplicates.

        The ``key`` may be a rdflib.URIRef or a URI string that maps onto a
        property name as defined in ``rdf_utils.property_mapping``. A key that
        cannot be resolved is ignored and nothing is created.

        If the nodes in the edge do not exist then they will be created using
        ``subject_iri`` and ``object_iri``. If the edge itself does not exist
        then it will be created using ``subject_iri``, ``object_iri`` and
        ``predicate_iri``.

        Parameters
        ----------
        subject_iri: [rdflib.URIRef, str]
            The IRI of the subject node of an edge in rdflib.Graph
        object_iri: rdflib.URIRef
            The IRI of the object node of an edge in rdflib.Graph
        predicate_iri: rdflib.URIRef
            The IRI of the predicate representing an edge in rdflib.Graph
        key: str
            The name of the attribute. Can be a rdflib.URIRef or URI string
        value: str
            The value of the attribute

        """
        key = self._resolve_attribute_key(key)
        if key is None:
            return

        # Delegate to add_edge() so that the nodes and the edge exist (as
        # documented) and so that the edge label -- and therefore the edge
        # key -- is derived by exactly the same rules used when edges are
        # created. add_edge() is a no-op for an edge that is already present.
        s, o, edge_label = self.add_edge(subject_iri, object_iri, predicate_iri)
        edge_key = generate_edge_key(s, edge_label, o)
        attr_dict = self.graph.get_edge_data(s, o, key=edge_key)
        self._add_attribute(attr_dict, key, value)

    @staticmethod
    def _resolve_attribute_key(key):
        """
        Translate an attribute key into a property name.

        Parameters
        ----------
        key: Union[rdflib.URIRef, str]
            A property name, a rdflib.URIRef or a URI string

        Returns
        -------
        str
            The lower-cased key when it is a known property name (an entry of
            ``is_property_multivalued``); otherwise whatever
            ``property_mapping`` holds for the key as a URIRef, which is
            ``None`` when there is no mapping.

        """
        lowered = key.lower()
        if lowered in is_property_multivalued:
            return lowered
        if not isinstance(key, URIRef):
            key = URIRef(key)
        return property_mapping.get(key)

    def _add_attribute(self, attr_dict: Dict, key: str, value: str) -> None:
        """
        Adds an attribute to the attribute dictionary, respecting whether or
        not that attribute should be multi-valued. Multi-valued attributes will
        not contain duplicates.

        Some attributes are singular form of others. In such cases overflowing
        values will be placed into the correlating multi-valued attribute. For
        example, `name` attribute will hold only one value while any additional
        value will be stored as `synonym` attribute. For any other
        single-valued attribute the first value is kept and later values are
        ignored.

        Parameters
        ----------
        attr_dict: dict
            Dictionary representing the attribute set of a node or an edge in
            a networkx graph
        key: str
            The name of the attribute
        value: str
            The value of the attribute

        """
        if key is None or key not in is_property_multivalued:
            logging.warning("Discarding key {} as it is not a valid property.".format(key))
            return

        value = make_curie(process_iri(value))

        if is_property_multivalued[key]:
            if key not in attr_dict:
                attr_dict[key] = [value]
            elif value not in attr_dict[key]:
                attr_dict[key].append(value)
        else:
            if key not in attr_dict:
                attr_dict[key] = value
            elif key == 'name' and attr_dict[key] != value:
                # Only a *different* name overflows into `synonym`; repeating
                # the current name must not record it as its own synonym.
                self._add_attribute(attr_dict, 'synonym', value)