Skip to content

inferring linked data from IPython run times.¤

automatically exporting rdf data during interactive computing.

we'll modify IPython's display formatter to include a system that describes python objects as json linked data.

    from functools import singledispatch, singledispatchmethod
    import types, gc, re, sys
    from IPython.core.formatters import DisplayFormatter, BaseFormatter, catch_format_error, JSONFormatter
    from traitlets import ObjectName, Unicode, Instance, List, Any
    from IPython import get_ipython
    from pathlib import Path

    # JSON-LD keyword strings bound to module constants so they can be used as dict keys below.
    TYPE, ID, GRAPH, CONTAINER, NEST, CONTEXT = "@type @id @graph @container @nest @context".split()
    # True when this code is executing as the top-level module.
    MAIN = __name__ == "__main__"
    # True only when running as __main__ AND no `__file__` name exists in the local namespace.
    # NOTE(review): presumably this separates an interactive session from a script/import — confirm.
    ACTIVE = "__file__" not in locals() and MAIN
    # whatever `get_ipython()` returns at the moment this line runs.
    shell = get_ipython()

a MetadataFormatter for including linked data in IPython reprs. this class carries machinery to generate:

  • ids for python types and objects as urns MetadataFormatter.get_id
  • linked data representations of python objects with MetadataFormatter.get_graph

MetadataFormatter.for_type, MetadataFormatter.get_id.register, and MetadataFormatter.get_graph.register extend the expression of the linked data graphs.

custom metadata formatter¤

    class MetadataFormatter(BaseFormatter):
        """Formatter that describes python objects as json linked data nodes.

        Nodes are accumulated in `graph` by `set_metadata` and handed back (and
        cleared) each time the formatter is called.  `get_id` and `get_graph` are
        single-dispatch methods, so other types can register their own
        identifiers and graph expressions.
        """

        # nodes collected since the last call to the formatter
        graph = List()
        format_type = Unicode('application/ld+json')
        _return_type = (list, dict)
        print_method = ObjectName('_repr_metadata_')

        @singledispatchmethod
        def get_id(self, object):
            """Return the first name `get_object` finds for `object`, or None."""
            return next(self.get_object(object), None)

        @singledispatchmethod
        def get_graph(self, object):
            """Build the default node (or list of nodes) describing `object`.

            The node always carries `@type` (the id of the object's type) and,
            when an id is found, `@id`.  When the id is a list, one node per id
            is returned.
            """
            data = {TYPE: self.get_id(type(object))}
            identifier = self.get_id(object)
            if identifier:
                data.setdefault(ID, identifier)
                if isinstance(identifier, list):
                    # the per-item id must come last, otherwise data[ID]
                    # (the whole list) overwrites it.
                    return [{**data, ID: x} for x in identifier]
            return data

        def get_object(self, object, filter=None):
            """Yield candidate names for `object` from every dict that refers to it.

            `filter` may be a compiled pattern or a pattern string; names that do
            not match it are skipped.
            """
            if isinstance(filter, str):
                filter = re.compile(filter)
            for referrer in gc.get_referrers(object):
                if isinstance(referrer, dict):
                    yield from self.get_object_from_ns(referrer, object, filter=filter)

        def get_object_from_ns(self, ns, object, filter=None):
            """Yield `parent#key` names for each public key in `ns` bound to `object`.

            The parent is the id of the owning class when `ns` has a
            `__weakref__` entry, otherwise the `__module__`/`__name__` found in
            `ns` followed by a colon.  When `ns` is `sys.modules` the object's
            own `__name__` is yielded instead.
            """
            weakref = ns.get("__weakref__")
            parent = None
            if weakref:
                parent = self.get_id(weakref.__objclass__)
            else:
                parent = ns.get("__module__", ns.get("__name__"))
                if parent:
                    parent += ":"

            if not parent and ns is sys.modules:
                # a bare `return value` inside a generator discards the value,
                # so the name has to be yielded before stopping.
                own_name = getattr(object, "__name__", None)
                if own_name is not None:
                    yield own_name
                return

            for k, v in ns.items():
                # referrer dicts are arbitrary: keys are not guaranteed to be strings.
                if v is not object or not isinstance(k, str) or k.startswith("_"):
                    continue
                name = F"{parent or 'noparent'}#{k}"
                if filter is not None and not filter.match(name):
                    continue
                yield name

        def get_session_cell_id(self):
            """Return `(cellId, session)` read from the kernel's parent message."""
            data = get_ipython().kernel.get_parent()
            return data["metadata"]["cellId"], data["header"]["session"]

        def set_metadata(self, object=None, **kwargs):
            """Append node(s) for `object` to `graph`, tagged with cell and session ids.

            With no `object` an empty node is used, so only the ids and `kwargs`
            are recorded.  `kwargs` override keys produced by `get_graph`.
            """
            ids = dict(zip(("cell:id", "session:id"), self.get_session_cell_id()))
            nodes = {} if object is None else self.get_graph(object)
            if isinstance(nodes, dict):
                nodes = [nodes]
            for node in nodes:
                self.graph.append({**ids, **node, **kwargs})

        def __call__(self, object):
            """Return the accumulated graph for `object` and reset the accumulator.

            If the parent formatter returns explicit metadata (a dict, or an
            iterable of dicts) each entry is recorded as keyword metadata;
            otherwise the object itself is described via `get_graph`.
            """
            explicit = super().__call__(object)
            if explicit:
                if isinstance(explicit, dict):
                    self.set_metadata(**explicit)
                else:
                    for entry in explicit:
                        self.set_metadata(**entry)
            else:
                self.set_metadata(object)
            try:
                return self.graph[:]
            finally:
                # hand back a copy, then empty the accumulator for the next display
                self.graph.clear()

custom display formatter¤

the LinkedDataFormatter customizes how IPython's normal DisplayFormatter expresses metadata.

    class LinkedDataFormatter(DisplayFormatter):
        """Display formatter that adds a linked-data graph to the metadata bundle."""

        metadata_formatter = Instance(MetadataFormatter, args=())

        def format(self, object, include=None, exclude=None):
            """Format `object` as usual, storing any graph nodes under `@graph`.

            Returns the `(data, metadata)` pair from the parent formatter; the
            metadata dict gains a `@graph` entry only when the metadata
            formatter produced something truthy.
            """
            bundle, metadata = super().format(object, include, exclude)
            nodes = self.metadata_formatter(object)
            if not nodes:
                return bundle, metadata
            metadata[GRAPH] = nodes
            return bundle, metadata

    def load_ipython_extension(shell=None):
        """Install a `LinkedDataFormatter` on `shell` and expose `set_metadata`.

        The new formatter is built from the trait values of the shell's current
        display formatter, and the metadata formatter's `set_metadata` is
        placed in the user namespace under the name `set_metadata`.

        `shell` defaults to the shell returned by `get_ipython()` at call time.
        """
        if shell is None:
            # a `shell=get_ipython()` default is evaluated once, when the
            # function is defined; look the shell up when actually called.
            shell = get_ipython()
        shell.display_formatter = LinkedDataFormatter(**shell.display_formatter._trait_values)
        shell.user_ns["set_metadata"] = shell.display_formatter.metadata_formatter.set_metadata

    def unload_ipython_extension(shell=None):
        """Replace the shell's display formatter with a plain `DisplayFormatter`.

        The replacement is built from the trait values of the formatter
        currently installed.  `shell` defaults to the shell returned by
        `get_ipython()` at call time.
        """
        if shell is None:
            # a `shell=get_ipython()` default is evaluated once, when the
            # function is defined; look the shell up when actually called.
            shell = get_ipython()
        shell.display_formatter = DisplayFormatter(**shell.display_formatter._trait_values)

extend how the graph is generated for tuples and strings as examples.

    @MetadataFormatter.get_graph.register(tuple)
    def get_graph_tuple(self, object):
        """Describe a tuple as the list of its members' graphs, in order."""
        return [self.get_graph(member) for member in object]

register a different id for modules. we use their namespaces for expansion later.

    @MetadataFormatter.get_id.register(types.ModuleType)
    def get_name(self, object):
        """Identify a module by its `__name__`."""
        module_name = object.__name__
        return module_name

activate the display formatter

    # short-circuit: the extension is loaded only when ACTIVE is truthy.
    ACTIVE and load_ipython_extension()

some data for the graph¤

dataframes¤

create a custom graph expression for pandas.DataFrames

    import pandas

    if ACTIVE:
        # register a graph expression for DataFrames: the default @id/@type pair
        # plus the frame's shape as a two-item list.
        shell.display_formatter.metadata_formatter.get_graph.register(pandas.DataFrame
        )(lambda s, x: {ID: s.get_id(x), TYPE: s.get_id(type(x)), "pandas.DataFrame:shape": list(x.shape)})
    if ACTIVE:
        import pandas
        df = pandas.DataFrame()
        # display a tuple so the tuple graph expression describes each member:
        # an instance, a module and a type.
        display((df, pandas, pandas.DataFrame))
(Empty DataFrame
 Columns: []
 Index: [],
 <module 'pandas' from '/home/tbone/mambaforge/lib/python3.9/site-packages/pandas/__init__.py'>,
 pandas.core.frame.DataFrame)

string or url¤

if there is a url hidden in a string we can elevate that as metadata, thereby linking it to a cell.

for example, this work revisits https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1 so we'll include that in the graph.

    @MetadataFormatter.get_id.register(str)
    def get_graph_str(self, object):
        """Use a string as its own id when it parses with a url scheme, else None."""
        from urllib.parse import urlparse
        has_scheme = bool(urlparse(object).scheme)
        return object if has_scheme else None
    "https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1"
'https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1'

looking at the metadata graph¤

our choice of "@id" and "@type" are jsonld conventions. through these conventions we can surface the metadata by creating a jsonld context. we can ensure a consistent structure of the notebook and thereby context.

    # json-ld context describing the notebook document structure.
    ctx = {
        # the notebook's `cells` key becomes `nb:cell`, kept as an ordered @list
        "cells": {
            ID: "nb:cell", CONTAINER: "@list", 
            # scoped context applied to the members of each cell
            CONTEXT: {
                # `outputs` maps to the same term as cell metadata; its inner `metadata` key is a @nest
                "outputs": {CONTEXT: {"metadata": NEST}, ID: "cell:metadata"},
                # the cell `id` value is typed as an @id
                "id": {ID: "cell:id", TYPE: ID},
                "cell_type": "cell:type",
                # cell `metadata` is a @graph container; `tags` map to `rdf:name`
                "metadata": {
                    ID: "cell:metadata",
                    CONTAINER: GRAPH,
                    CONTEXT: {
                        "tags": "rdf:name"
                    }
                },

            }
        },
        "@version": 1.1}
    if ACTIVE:
        # read this notebook's own json from disk.
        file = Path("2022-10-29-metadata-formatter.ipynb")
        # decode explicitly as utf-8 instead of the platform's locale default.
        # `__import__` is used inline, so no `json` name is bound in this namespace.
        data = __import__("json").loads(file.read_text(encoding="utf-8"))
    if ACTIVE:
        from pyld import jsonld
        from IPython.display import JSON
        # compact against an empty context, using `ctx` as the expansion context.
        from_local = jsonld.compact(data, {}, options=dict(expandContext=ctx))
        # `set_metadata` is the name placed in the user namespace by `load_ipython_extension`;
        # this records an extra node for `from_local` carrying a description.
        set_metadata(from_local, **{"rdf:description": "all of the things we can expand from the notebook metadata."})
        display(from_local)
{'nb:cell': {'@list': [{'cell:id': {'@id': 'b89d004a-233f-452e-9da7-84604f291174'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '1734421e-0762-47e6-8f5f-6f3468bf7b2f'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '2f97b27e-f363-4dce-a2d5-45927c92feba'},
    'cell:metadata': {'@graph': {'rdf:name': ['imports', 'constants']}},
    'cell:type': 'code'},
   {'cell:id': {'@id': 'd2ed9bc9-c426-4bb4-98fb-bf4f3d26c299'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '1ea5f698-7263-4dc2-b24f-232e9f1dcfcd'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'cec19c6c-affe-4378-99d9-c927f8d2e726'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '55478faa-bcc6-4447-9e28-607b2b6bafa6'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '58291cbb-ce73-4bfa-b7ea-4c989090666a'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '2069ae29-068c-4f0b-ab52-68ea31169358'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '12548308-caf8-4760-8650-acc8fb13fa6d'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '0aa91bd4-ef7e-4447-8572-f8e5db6af7fd'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '22717127-56b3-4e8d-833e-21ba302cab49'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '131bd548-e951-4a6e-b759-c7630bc6e20e'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '4dc89ef5-0fd5-4510-9399-2a45ab27bef3'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'f121d061-df1c-4a41-8dac-de2ffabc54db'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '01efe602-d5c1-4945-a3f7-fdae95ac4349'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '50d9dcf9-18f4-4eb7-9a9a-cc3e14b3b2e2'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'f1227a1b-dbd7-4923-9f04-6b7db2e9828f'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '37e64bd4-d119-470b-b480-44778d33c549'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0'},
    'cell:metadata': [{'@graph': {}},
     {'@graph': [{'@id': '__main__:#df',
        '@type': 'pandas.io.xml:#DataFrame',
        'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
        'pandas.DataFrame:shape': [0, 0],
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
       {'@id': 'pandas',
        '@type': 'types:#ModuleType',
        'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
       {'@id': 'pandas.io.xml:#DataFrame',
        '@type': 'builtins:#type',
        'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}]}],
    'cell:type': 'code'},
   {'cell:id': {'@id': 'e3dc5bdb-b499-4309-aa79-c0c333f97d56'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'db25096c-b1bb-40a1-9330-e58c96f77a6c'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'bc71a7c4-c872-454d-8616-64a741dee4f0'},
    'cell:metadata': [{'@graph': {}},
     {'@graph': {'@id': 'https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1',
       '@type': 'builtins:#str',
       'cell:id': 'bc71a7c4-c872-454d-8616-64a741dee4f0',
       'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}}],
    'cell:type': 'code'},
   {'cell:id': {'@id': '67ce52b5-20b8-4adf-8518-6368d3c24303'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'c4377aed-7bb8-4fa1-8dd9-518ef22511a9'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '24ee56a0-829c-4be6-8194-88e648e6ac11'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e'},
    'cell:metadata': [{'@graph': {}},
     {'@graph': [{'@id': '__main__:#from_local',
        '@type': 'builtins:#dict',
        'cell:id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e',
        'rdf:description': 'all of the things we can expand from the notebook metadata.',
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
       {'@id': '__main__:#from_local',
        '@type': 'builtins:#dict',
        'cell:id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e',
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}]}],
    'cell:type': 'code'},
   {'cell:id': {'@id': '31d8a531-2334-4243-9c78-25137ce58637'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '55ba1060-beca-4342-a780-db7fd9c8b5ac'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '3e3d2b0b-81b5-4193-82d6-ec27bf37eaa2'},
    'cell:metadata': [{'@graph': {}},
     {'@graph': {'@id': '__main__:#from_remote',
       '@type': 'builtins:#dict',
       'cell:id': '3e3d2b0b-81b5-4193-82d6-ec27bf37eaa2',
       'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}}],
    'cell:type': 'code'},
   {'cell:id': {'@id': 'aa2688dd-5237-41ec-981a-7e9b8e2a67b0'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'b461d781-4ec6-4d4d-a49e-251e908421f5'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '30103d50-0741-4a56-95e1-d49c880ef6c3'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'}]}}

when the post is published we can condense the notation.

    # raw url of the published notebook; the trailing expression echoes the value.
    remote = "https://raw.githubusercontent.com/tonyfast/tonyfast/main/tonyfast/xxii/oct/2022-10-29-metadata-formatter.ipynb"; remote
'https://raw.githubusercontent.com/tonyfast/tonyfast/main/tonyfast/xxii/oct/2022-10-29-metadata-formatter.ipynb'
    if ACTIVE:
        # same compaction as for the local file, but the input is the url string.
        # NOTE(review): presumably pyld fetches and parses the document at `remote` — confirm.
        from_remote = jsonld.compact(
            remote, {},
            options=dict(expandContext=ctx)
        )
        display(from_remote)
{'nb:cell': {'@list': [{'cell:id': {'@id': 'b89d004a-233f-452e-9da7-84604f291174'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '1734421e-0762-47e6-8f5f-6f3468bf7b2f'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '2f97b27e-f363-4dce-a2d5-45927c92feba'},
    'cell:metadata': {'@graph': {'rdf:name': ['imports', 'constants']}},
    'cell:type': 'code'},
   {'cell:id': {'@id': 'd2ed9bc9-c426-4bb4-98fb-bf4f3d26c299'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '1ea5f698-7263-4dc2-b24f-232e9f1dcfcd'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'cec19c6c-affe-4378-99d9-c927f8d2e726'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '55478faa-bcc6-4447-9e28-607b2b6bafa6'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '58291cbb-ce73-4bfa-b7ea-4c989090666a'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '2069ae29-068c-4f0b-ab52-68ea31169358'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '12548308-caf8-4760-8650-acc8fb13fa6d'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '0aa91bd4-ef7e-4447-8572-f8e5db6af7fd'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '22717127-56b3-4e8d-833e-21ba302cab49'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '131bd548-e951-4a6e-b759-c7630bc6e20e'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '4dc89ef5-0fd5-4510-9399-2a45ab27bef3'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'f121d061-df1c-4a41-8dac-de2ffabc54db'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '01efe602-d5c1-4945-a3f7-fdae95ac4349'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '50d9dcf9-18f4-4eb7-9a9a-cc3e14b3b2e2'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'f1227a1b-dbd7-4923-9f04-6b7db2e9828f'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '37e64bd4-d119-470b-b480-44778d33c549'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0'},
    'cell:metadata': [{'@graph': {}},
     {'@graph': [{'@id': '#df',
        '@type': 'pandas.io.xml:#DataFrame',
        'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
        'pandas.DataFrame:shape': [0, 0],
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
       {'@id': 'pandas',
        '@type': 'types:#ModuleType',
        'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
       {'@id': 'pandas.io.xml:#DataFrame',
        '@type': 'builtins:#type',
        'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}]}],
    'cell:type': 'code'},
   {'cell:id': {'@id': 'e3dc5bdb-b499-4309-aa79-c0c333f97d56'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'db25096c-b1bb-40a1-9330-e58c96f77a6c'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'bc71a7c4-c872-454d-8616-64a741dee4f0'},
    'cell:metadata': [{'@graph': {}},
     {'@graph': {'@id': 'https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1',
       '@type': 'builtins:#str',
       'cell:id': 'bc71a7c4-c872-454d-8616-64a741dee4f0',
       'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}}],
    'cell:type': 'code'},
   {'cell:id': {'@id': '67ce52b5-20b8-4adf-8518-6368d3c24303'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'c4377aed-7bb8-4fa1-8dd9-518ef22511a9'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '24ee56a0-829c-4be6-8194-88e648e6ac11'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e'},
    'cell:metadata': [{'@graph': {}},
     {'@graph': [{'@id': '#from_local',
        '@type': 'builtins:#dict',
        'cell:id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e',
        'rdf:description': 'all of the things we can expand from the notebook metadata.',
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
       {'@id': '#from_local',
        '@type': 'builtins:#dict',
        'cell:id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e',
        'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}]}],
    'cell:type': 'code'},
   {'cell:id': {'@id': '31d8a531-2334-4243-9c78-25137ce58637'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '55ba1060-beca-4342-a780-db7fd9c8b5ac'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'code'},
   {'cell:id': {'@id': '3e3d2b0b-81b5-4193-82d6-ec27bf37eaa2'},
    'cell:metadata': [{'@graph': {}},
     {'@graph': {'@id': '#from_remote',
       '@type': 'builtins:#dict',
       'cell:id': '3e3d2b0b-81b5-4193-82d6-ec27bf37eaa2',
       'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}}],
    'cell:type': 'code'},
   {'cell:id': {'@id': 'aa2688dd-5237-41ec-981a-7e9b8e2a67b0'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': 'b461d781-4ec6-4d4d-a49e-251e908421f5'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'},
   {'cell:id': {'@id': '30103d50-0741-4a56-95e1-d49c880ef6c3'},
    'cell:metadata': {'@graph': {}},
    'cell:type': 'markdown'}]}}

this notebook is certified to have metadata

things we capture¤

in this proof of concept we don't capture much, but we do expose machinery to test this concept further and extend.

we capture:

  • kernel session id which can verify the outputs are generated in the same session
  • each cell id that makes it possible to link back to the source cells.
  • some python variable information.

things we can capture in the graph.¤

  • annotations are type to id mappings.
  • we could trace function calls
  • we could encode imports
  • we could capture variable assignment