inferring linked data from IPython run times.
automatically exporting RDF data during interactive computing.
we'll modify IPython's display formatter to include a system that describes
python objects as JSON linked data.
from functools import singledispatch , singledispatchmethod
import types , gc , re , sys
from IPython.core.formatters import DisplayFormatter , BaseFormatter , catch_format_error , JSONFormatter
from traitlets import ObjectName , Unicode , Instance , List , Any
from IPython import get_ipython
from pathlib import Path
# JSON-LD keywords, used as dictionary keys when building graph nodes and the context.
(TYPE, ID, GRAPH, CONTAINER, NEST, CONTEXT) = "@type @id @graph @container @nest @context".split()
# True when this code runs as the top-level program rather than as an import.
MAIN = "__main__" == __name__
# True only for a top-level run whose namespace has no ``__file__`` entry
# (presumably an interactive/notebook session rather than a script -- confirm).
ACTIVE = MAIN and "__file__" not in locals()
# The shell object returned by IPython at import time.
shell = get_ipython()
a MetadataFormatter for including linked data in IPython reprs.
this class carries machinery to generate:
ids for python types and objects as urns, with MetadataFormatter.get_id
linked data representations of python objects, with MetadataFormatter.get_graph
MetadataFormatter.for_type, MetadataFormatter.get_id.register, and MetadataFormatter.get_graph.register
extend the expression of the linked data graphs.
class MetadataFormatter(BaseFormatter):
    """Formatter that describes displayed python objects as JSON-LD nodes.

    Calling the formatter with an object returns a list of node dictionaries.
    Each node carries the current cell id and session id, plus whatever
    ``get_graph`` produces for the object (by default its ``@type`` and, when a
    name can be found, its ``@id``).

    ``get_id`` and ``get_graph`` are single-dispatch methods, so handlers for
    specific types can be attached with ``get_id.register`` and
    ``get_graph.register``.
    """

    # Nodes collected since the last format call; drained by ``__call__``.
    graph = List()
    format_type = Unicode('application/ld+json')
    _return_type = (list, dict)
    # Objects may describe themselves by defining this method.
    print_method = ObjectName('_repr_metadata_')

    @singledispatchmethod
    def get_id(self, object):
        """Return the first name found for *object*, or ``None`` if there is none."""
        return next(self.get_object(object), None)

    @singledispatchmethod
    def get_graph(self, object):
        """Return the node (``dict``) or nodes (``list`` of ``dict``) describing *object*.

        The node always has ``@type`` set to the id of ``type(object)``. When the
        object has an id, ``@id`` is added; when a registered ``get_id`` handler
        returns a list of ids, one node per id is returned.
        """
        # NOTE(review): nesting reconstructed from flattened source -- the list
        # check is assumed to live inside the ``if identifier`` branch.
        data = {TYPE: self.get_id(type(object))}
        identifier = self.get_id(object)
        if identifier:
            if isinstance(identifier, list):
                # Put ``ID`` last so each node keeps its own id instead of
                # being overwritten by a shared value.
                return [{**data, ID: x} for x in identifier]
            data[ID] = identifier
        return data

    def get_object(self, object, filter=None):
        """Yield candidate names for *object* from every dict that refers to it.

        Args:
            object: the object to look up.
            filter: optional regular expression (string or compiled pattern);
                only names it matches from the start are yielded.
        """
        if isinstance(filter, str):
            filter = re.compile(filter)
        for referrer in gc.get_referrers(object):
            if isinstance(referrer, dict):
                yield from self.get_object_from_ns(referrer, object, filter=filter)

    def get_object_from_ns(self, ns, object, filter=None):
        """Yield ``"<parent>#<key>"`` for each public key in *ns* bound to *object*.

        The parent prefix is the id of the owning class when *ns* holds a
        ``__weakref__`` entry, otherwise the ``__module__``/``__name__`` entry of
        *ns* followed by ``":"``. ``"noparent"`` is used when neither is present.
        """
        weakref = ns.get("__weakref__")
        if weakref:
            parent = self.get_id(weakref.__objclass__)
        else:
            # NOTE(review): nesting reconstructed from flattened source -- the
            # ":" suffix is assumed to apply only to module-style parents.
            parent = ns.get("__module__", ns.get("__name__"))
            if parent:
                parent += ":"
        if not parent and ns is sys.modules:
            # A ``return <value>`` inside a generator is never seen by the
            # consumer, so the module name has to be yielded explicitly.
            module_name = getattr(object, "__name__", None)
            if module_name:
                yield module_name
            return
        for key, value in ns.items():
            if value is not object:
                continue
            # Referring dicts may have non-string keys; those cannot be names.
            if not isinstance(key, str) or key.startswith("_"):
                continue
            name = f"{parent or 'noparent'}#{key}"
            if filter is not None and not filter.match(name):
                continue
            yield name

    def get_session_cell_id(self):
        """Return ``(cell id, session id)`` read from the kernel's parent message."""
        # Assumes a kernel-backed shell whose parent message carries
        # ``metadata.cellId`` -- TODO confirm for other frontends.
        data = get_ipython().kernel.get_parent()
        return data["metadata"]["cellId"], data["header"]["session"]

    def set_metadata(self, object=None, **kwargs):
        """Append node(s) for *object* to ``graph``, tagged with cell and session ids.

        Args:
            object: object to describe through ``get_graph``; when ``None`` the
                node consists of the ids and *kwargs* only.
            **kwargs: extra properties merged into every node; they take
                precedence over generated keys.
        """
        ids = dict(zip(("cell:id", "session:id"), self.get_session_cell_id()))
        nodes = {} if object is None else self.get_graph(object)
        if isinstance(nodes, dict):
            nodes = [nodes]
        for node in nodes:
            self.graph.append({**ids, **node, **kwargs})

    def __call__(self, object):
        """Return the nodes for *object* and reset the pending graph.

        Metadata returned by the base formatter (the object's own
        ``_repr_metadata_`` or a registered type formatter) wins; otherwise the
        node is inferred through ``set_metadata``. Nodes appended to ``graph``
        before this call are returned with it.
        """
        explicit = super().__call__(object)
        if explicit:
            if isinstance(explicit, dict):
                self.set_metadata(**explicit)
            else:
                for entry in explicit:
                    self.set_metadata(**entry)
        else:
            self.set_metadata(object)
        try:
            return self.graph[:]
        finally:
            self.graph.clear()
the LinkedDataFormatter customizes how IPython's normal DisplayFormatter
expresses metadata.
class LinkedDataFormatter(DisplayFormatter):
    """Display formatter that attaches a linked-data graph to the output metadata."""

    # Created on demand with no constructor arguments.
    metadata_formatter = Instance(MetadataFormatter, args=())

    def format(self, object, include=None, exclude=None):
        """Format *object* as usual and store its graph under ``"@graph"``.

        Returns the ``(data, metadata)`` pair from the parent formatter. The
        metadata dictionary gains a ``"@graph"`` entry only when the metadata
        formatter produced a non-empty result.
        """
        bundle, metadata = super().format(object, include, exclude)
        nodes = self.metadata_formatter(object)
        if not nodes:
            return bundle, metadata
        metadata[GRAPH] = nodes
        return bundle, metadata
def load_ipython_extension(shell=get_ipython()):
    """Install a ``LinkedDataFormatter`` on *shell* and expose ``set_metadata``.

    The new formatter is built from the trait values of the formatter it
    replaces. The metadata formatter's ``set_metadata`` method is placed in the
    user namespace under the name ``set_metadata``.

    Args:
        shell: the IPython shell to modify; defaults to the active shell.
    """
    if shell is None:
        # The default is evaluated once at definition time; look again in case
        # a shell was created after this module was imported.
        shell = get_ipython()
    shell.display_formatter = LinkedDataFormatter(**shell.display_formatter._trait_values)
    shell.user_ns["set_metadata"] = shell.display_formatter.metadata_formatter.set_metadata
def unload_ipython_extension(shell=get_ipython()):
    """Restore a plain ``DisplayFormatter`` on *shell* and remove ``set_metadata``.

    Args:
        shell: the IPython shell to modify; defaults to the active shell.
    """
    if shell is None:
        # The default is evaluated once at definition time; look again in case
        # a shell was created after this module was imported.
        shell = get_ipython()
    traits = dict(shell.display_formatter._trait_values)
    # ``metadata_formatter`` is not a trait of the stock formatter, so it must
    # not be forwarded to its constructor.
    linked = traits.pop("metadata_formatter", None)
    shell.display_formatter = DisplayFormatter(**traits)
    # Only remove the helper if it is still the one bound to the formatter
    # being uninstalled; a user-defined ``set_metadata`` is left alone.
    helper = shell.user_ns.get("set_metadata")
    if linked is not None and getattr(helper, "__self__", None) is linked:
        del shell.user_ns["set_metadata"]
extend how the graph is generated for tuples and strings as examples.
@MetadataFormatter.get_graph.register(tuple)
def get_graph_tuple(self, object):
    """Return a flat list with the node(s) of every item in the tuple.

    An item's graph may itself be a list (a nested tuple, or an object with
    several ids). Such lists are spliced in rather than nested, because
    ``set_metadata`` unpacks each entry as a mapping.
    """
    nodes = []
    for item in object:
        described = self.get_graph(item)
        if isinstance(described, dict):
            nodes.append(described)
        else:
            nodes.extend(described)
    return nodes
register a different id for modules. we use their namespaces for expansion later.
@MetadataFormatter.get_id.register(types.ModuleType)
def get_name(self, object):
    """Identify a module by its ``__name__`` instead of a namespace lookup."""
    module_name = object.__name__
    return module_name
activate the display formatter
# Install the formatter only when ACTIVE is true.
if ACTIVE:
    load_ipython_extension()
some data for the graph
dataframes
create a custom graph expression for pandas.DataFrames
import pandas
if ACTIVE:
    # Register a DataFrame-specific graph handler on the installed metadata
    # formatter. The node keeps the usual "@id" and "@type" and adds the
    # frame's shape as a list under "pandas.DataFrame:shape".
    shell.display_formatter.metadata_formatter.get_graph.register(pandas.DataFrame
    )(lambda s, x: {ID: s.get_id(x), TYPE: s.get_id(type(x)), "pandas.DataFrame:shape": list(x.shape)})
if ACTIVE:
    import pandas
    # The variable name matters: ids are derived from the names bound to an
    # object in the namespaces that refer to it.
    df = pandas.DataFrame()
    # Displaying a tuple goes through the tuple graph handler, giving one node
    # per element: the frame, the module and the class.
    display((df, pandas, pandas.DataFrame))
(Empty DataFrame
Columns: []
Index: [],
<module 'pandas' from '/home/tbone/mambaforge/lib/python3.9/site-packages/pandas/__init__.py'>,
pandas.core.frame.DataFrame)
string or url
if there is a url hidden in a string we can elevate it to metadata,
thereby linking it to a cell.
for example, this work revisits https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1 so we'll include that in the graph.
@MetadataFormatter.get_id.register(str)
def get_graph_str(self, object):
    """Use a string as its own id when it parses with a URL scheme.

    Returns the string unchanged if ``urlparse`` finds a scheme, otherwise
    ``None``.
    """
    from urllib.parse import urlparse

    has_scheme = bool(urlparse(object).scheme)
    return object if has_scheme else None
# A bare URL string as the cell's value: the str handler registered on get_id
# makes the URL itself the node's "@id".
"https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1"
'https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1'
our choices of "@id" and "@type" are JSON-LD conventions.
through these conventions we can surface the metadata by creating
a JSON-LD context. we can rely on the consistent structure of the notebook
and thereby of the context.
# JSON-LD context that maps the notebook document's keys onto prefixed terms.
ctx = {
    # the notebook's "cells" array becomes an ordered "nb:cell" list
    "cells": {
        ID: "nb:cell", CONTAINER: "@list",
        # scoped context applied to the members of each cell
        CONTEXT: {
            # within outputs, "metadata" is declared as a nesting key
            "outputs": {CONTEXT: {"metadata": NEST}, ID: "cell:metadata"},
            # the cell "id" value is treated as an identifier
            "id": {ID: "cell:id", TYPE: ID},
            "cell_type": "cell:type",
            # cell metadata is held as a graph container; "tags" map to "rdf:name"
            "metadata": {
                ID: "cell:metadata",
                CONTAINER: GRAPH,
                CONTEXT: {
                    "tags": "rdf:name"
                }
            },
        }
    },
    "@version": 1.1}
if ACTIVE:
    import json

    # Read this notebook's own file so its stored metadata can be expanded.
    file = Path("2022-10-29-metadata-formatter.ipynb")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    data = json.loads(file.read_text(encoding="utf-8"))
if ACTIVE:
    from pyld import jsonld
    from IPython.display import JSON  # imported but not used in this cell
    # Compact the parsed notebook against an empty context, with ``ctx``
    # supplied as the expansion context.
    from_local = jsonld.compact(data, {}, options=dict(expandContext=ctx))
    # ``set_metadata`` is the helper placed in the user namespace when the
    # extension loads; it queues a node for ``from_local`` carrying the extra
    # "rdf:description" property.
    set_metadata(from_local, **{"rdf:description": "all of the things we can expand from the notebook metadata."})
    # Displaying the value adds the automatically inferred node as well.
    display(from_local)
{'nb:cell': {'@list': [{'cell:id': {'@id': 'b89d004a-233f-452e-9da7-84604f291174'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '1734421e-0762-47e6-8f5f-6f3468bf7b2f'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '2f97b27e-f363-4dce-a2d5-45927c92feba'},
'cell:metadata': {'@graph': {'rdf:name': ['imports', 'constants']}},
'cell:type': 'code'},
{'cell:id': {'@id': 'd2ed9bc9-c426-4bb4-98fb-bf4f3d26c299'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '1ea5f698-7263-4dc2-b24f-232e9f1dcfcd'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'cec19c6c-affe-4378-99d9-c927f8d2e726'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '55478faa-bcc6-4447-9e28-607b2b6bafa6'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '58291cbb-ce73-4bfa-b7ea-4c989090666a'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '2069ae29-068c-4f0b-ab52-68ea31169358'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '12548308-caf8-4760-8650-acc8fb13fa6d'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '0aa91bd4-ef7e-4447-8572-f8e5db6af7fd'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '22717127-56b3-4e8d-833e-21ba302cab49'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '131bd548-e951-4a6e-b759-c7630bc6e20e'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '4dc89ef5-0fd5-4510-9399-2a45ab27bef3'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'f121d061-df1c-4a41-8dac-de2ffabc54db'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '01efe602-d5c1-4945-a3f7-fdae95ac4349'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '50d9dcf9-18f4-4eb7-9a9a-cc3e14b3b2e2'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'f1227a1b-dbd7-4923-9f04-6b7db2e9828f'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '37e64bd4-d119-470b-b480-44778d33c549'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0'},
'cell:metadata': [{'@graph': {}},
{'@graph': [{'@id': '__main__:#df',
'@type': 'pandas.io.xml:#DataFrame',
'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
'pandas.DataFrame:shape': [0, 0],
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
{'@id': 'pandas',
'@type': 'types:#ModuleType',
'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
{'@id': 'pandas.io.xml:#DataFrame',
'@type': 'builtins:#type',
'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}]}],
'cell:type': 'code'},
{'cell:id': {'@id': 'e3dc5bdb-b499-4309-aa79-c0c333f97d56'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'db25096c-b1bb-40a1-9330-e58c96f77a6c'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'bc71a7c4-c872-454d-8616-64a741dee4f0'},
'cell:metadata': [{'@graph': {}},
{'@graph': {'@id': 'https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1',
'@type': 'builtins:#str',
'cell:id': 'bc71a7c4-c872-454d-8616-64a741dee4f0',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}}],
'cell:type': 'code'},
{'cell:id': {'@id': '67ce52b5-20b8-4adf-8518-6368d3c24303'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'c4377aed-7bb8-4fa1-8dd9-518ef22511a9'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '24ee56a0-829c-4be6-8194-88e648e6ac11'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e'},
'cell:metadata': [{'@graph': {}},
{'@graph': [{'@id': '__main__:#from_local',
'@type': 'builtins:#dict',
'cell:id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e',
'rdf:description': 'all of the things we can expand from the notebook metadata.',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
{'@id': '__main__:#from_local',
'@type': 'builtins:#dict',
'cell:id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}]}],
'cell:type': 'code'},
{'cell:id': {'@id': '31d8a531-2334-4243-9c78-25137ce58637'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '55ba1060-beca-4342-a780-db7fd9c8b5ac'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '3e3d2b0b-81b5-4193-82d6-ec27bf37eaa2'},
'cell:metadata': [{'@graph': {}},
{'@graph': {'@id': '__main__:#from_remote',
'@type': 'builtins:#dict',
'cell:id': '3e3d2b0b-81b5-4193-82d6-ec27bf37eaa2',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}}],
'cell:type': 'code'},
{'cell:id': {'@id': 'aa2688dd-5237-41ec-981a-7e9b8e2a67b0'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'b461d781-4ec6-4d4d-a49e-251e908421f5'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '30103d50-0741-4a56-95e1-d49c880ef6c3'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'}]}}
when the post is published we can condense the notation.
# Raw URL of the published notebook; the trailing bare ``remote`` makes the
# value the cell's result.
remote = "https://raw.githubusercontent.com/tonyfast/tonyfast/main/tonyfast/xxii/oct/2022-10-29-metadata-formatter.ipynb"; remote
'https://raw.githubusercontent.com/tonyfast/tonyfast/main/tonyfast/xxii/oct/2022-10-29-metadata-formatter.ipynb'
if ACTIVE:
    # Same compaction as for the local file, but the URL string is passed in
    # place of the parsed document.
    # NOTE(review): presumably pyld fetches and parses the remote notebook
    # itself -- confirm against the configured document loader.
    from_remote = jsonld.compact(
        remote, {},
        options=dict(expandContext=ctx)
    )
    display(from_remote)
{'nb:cell': {'@list': [{'cell:id': {'@id': 'b89d004a-233f-452e-9da7-84604f291174'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '1734421e-0762-47e6-8f5f-6f3468bf7b2f'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '2f97b27e-f363-4dce-a2d5-45927c92feba'},
'cell:metadata': {'@graph': {'rdf:name': ['imports', 'constants']}},
'cell:type': 'code'},
{'cell:id': {'@id': 'd2ed9bc9-c426-4bb4-98fb-bf4f3d26c299'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '1ea5f698-7263-4dc2-b24f-232e9f1dcfcd'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'cec19c6c-affe-4378-99d9-c927f8d2e726'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '55478faa-bcc6-4447-9e28-607b2b6bafa6'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '58291cbb-ce73-4bfa-b7ea-4c989090666a'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '2069ae29-068c-4f0b-ab52-68ea31169358'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '12548308-caf8-4760-8650-acc8fb13fa6d'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '0aa91bd4-ef7e-4447-8572-f8e5db6af7fd'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '22717127-56b3-4e8d-833e-21ba302cab49'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '131bd548-e951-4a6e-b759-c7630bc6e20e'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '4dc89ef5-0fd5-4510-9399-2a45ab27bef3'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'f121d061-df1c-4a41-8dac-de2ffabc54db'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '01efe602-d5c1-4945-a3f7-fdae95ac4349'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '50d9dcf9-18f4-4eb7-9a9a-cc3e14b3b2e2'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'f1227a1b-dbd7-4923-9f04-6b7db2e9828f'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '37e64bd4-d119-470b-b480-44778d33c549'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0'},
'cell:metadata': [{'@graph': {}},
{'@graph': [{'@id': '#df',
'@type': 'pandas.io.xml:#DataFrame',
'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
'pandas.DataFrame:shape': [0, 0],
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
{'@id': 'pandas',
'@type': 'types:#ModuleType',
'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
{'@id': 'pandas.io.xml:#DataFrame',
'@type': 'builtins:#type',
'cell:id': 'e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}]}],
'cell:type': 'code'},
{'cell:id': {'@id': 'e3dc5bdb-b499-4309-aa79-c0c333f97d56'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'db25096c-b1bb-40a1-9330-e58c96f77a6c'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'bc71a7c4-c872-454d-8616-64a741dee4f0'},
'cell:metadata': [{'@graph': {}},
{'@graph': {'@id': 'https://nbviewer.org/gist/tonyfast/16d3bc82d69890949212b46040bd86e1',
'@type': 'builtins:#str',
'cell:id': 'bc71a7c4-c872-454d-8616-64a741dee4f0',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}}],
'cell:type': 'code'},
{'cell:id': {'@id': '67ce52b5-20b8-4adf-8518-6368d3c24303'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'c4377aed-7bb8-4fa1-8dd9-518ef22511a9'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '24ee56a0-829c-4be6-8194-88e648e6ac11'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e'},
'cell:metadata': [{'@graph': {}},
{'@graph': [{'@id': '#from_local',
'@type': 'builtins:#dict',
'cell:id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e',
'rdf:description': 'all of the things we can expand from the notebook metadata.',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'},
{'@id': '#from_local',
'@type': 'builtins:#dict',
'cell:id': 'fc202593-9a00-4f3b-9748-c5f48b7c475e',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}]}],
'cell:type': 'code'},
{'cell:id': {'@id': '31d8a531-2334-4243-9c78-25137ce58637'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '55ba1060-beca-4342-a780-db7fd9c8b5ac'},
'cell:metadata': {'@graph': {}},
'cell:type': 'code'},
{'cell:id': {'@id': '3e3d2b0b-81b5-4193-82d6-ec27bf37eaa2'},
'cell:metadata': [{'@graph': {}},
{'@graph': {'@id': '#from_remote',
'@type': 'builtins:#dict',
'cell:id': '3e3d2b0b-81b5-4193-82d6-ec27bf37eaa2',
'session:id': 'f96335f1-429c-4c08-a07e-9aa81224d66d'}}],
'cell:type': 'code'},
{'cell:id': {'@id': 'aa2688dd-5237-41ec-981a-7e9b8e2a67b0'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': 'b461d781-4ec6-4d4d-a49e-251e908421f5'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'},
{'cell:id': {'@id': '30103d50-0741-4a56-95e1-d49c880ef6c3'},
'cell:metadata': {'@graph': {}},
'cell:type': 'markdown'}]}}
this notebook is certified to have metadata
things we capture
in this proof of concept we don't capture much, but we do expose machinery to test this concept further and extend it.
we capture:
the kernel session id, which can verify that the outputs were generated in the same session
each cell id, which makes it possible to link back to the source cells.
some python variable information.
things we can capture in the graph.
annotations are type to id mappings.
we could trace function calls
we could encode imports
we could capture variable assignment