unravel directories of notebooks with dask
this is another pass at using dask to load notebooks, with the ultimate intent to search them. in searching-notebooks, i first approached this task with some keen pandas skills that were not so kind in dask land. this document takes another pass, using clearer expressions to ravel a bunch of notebooks into a dask.dataframe.

taking care to load notebooks as dask.dataframes offers the power to apply direct queries, export to parquet, export to sqlite, export to duckdb, arrow..
import pandas, orjson, dask.bag, dask.dataframe; from pathlib import Path
from toolz.curried import *
XXX = __name__ == "__main__" and "__file__" not in locals()  # true only in interactive runs, gating the demos below
if you know the shape then define it
dask truly prefers explicit dtypes while pandas is more flexible. meta holds our shape information for the cells, outputs, and displays.
class meta:
    O = "object"
    ANY = None, O

    # explicit column names and dtypes for each level of the notebook document
    NB = [("cells", O), ("metadata", O), ("nbformat", int), ("nbformat_minor", int)]
    CELL = [
        ("cell_type", str), ("execution_count", int), ("id", str),
        ("metadata", O), ("outputs", O), ("source", str), ("cell_ct", int),]
    OUTPUT = [
        ("data", O), ("metadata", O), ("ename", str), ("evalue", str),
        ("text", str), ("execution_count", int), ("output_type", str), ("output_ct", int)]
    DISPLAY = [("type", str), ("value", str)]

    # empty template rows we copy then fill while unpacking the raw json
    new_nb = pandas.Series(index=list(map(first, NB)), dtype="O")
    new_cell = pandas.Series(index=list(map(first, CELL)), dtype="O")
    new_output = pandas.Series(index=list(map(first, OUTPUT)), dtype="O")
    new_display = pandas.Series(index=list(map(first, DISPLAY)), dtype="O")
def enumerate_list(x, key="cell_ct"):
    # tag each entry with its position so provenance survives explode()
    return [{key: i, **y} for i, y in enumerate(x)]

def get_series(data, key="text", new=meta.new_output):
    # join a multiline field, then fill a fresh template row; keys missing
    # from the template index are dropped by Series.update
    if key in data:
        data[key] = "".join(data[key])
    s = new.copy()
    return s.update(data) or s
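a quick check of the helper, off dask, shows the coercion at work. the stream payload below is made up for illustration; note that keys outside meta.OUTPUT, like the stream name, are silently dropped.

XXX and get_series({"output_type": "stream", "name": "stdout", "text": ["hello", " world"]})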
off to the races as we load some data from our local files.
WHERE = Path("oct")
the files we include start as, and remain, our index. in prior iterations there were a few set-index operations, but we don't want to be opening files to do this because that is costly. we'll store other metadata on the dataframe as we unpack the notebook shapes.
def get_files(WHERE=WHERE):
    return dask.bag.from_sequence(
        dict(file=str(x)) for x in WHERE.glob("*.ipynb")
    ).to_dataframe().set_index("file")
XXX and (files := get_files())
contents loads our files into a dataframe containing real cell contents. each row is a file.
def get_contents_from_files(files):
    return files.index.to_series().apply(
        compose_left(Path, Path.read_text, orjson.loads, partial(
            get_series, new=meta.new_nb)), meta=meta.NB)
XXX and (contents := get_contents_from_files(files))
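before exploding anything we can already ask cheap questions of the raw contents. a minimal sketch, counting how many cells each file carries:

XXX and contents.cells.apply(len, meta=("cells", int))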
the cells are built by exploding the rows of the contents.
def get_cells_from_contents(contents):
    cells = contents.cells
    cells = cells.apply(enumerate_list, meta=meta.ANY)
    return cells.explode().apply(get_series, key="source", new=meta.new_cell, meta=meta.CELL)
if XXX:
    cells = get_cells_from_contents(contents)
    meta_cells = cells["metadata cell_ct".split()]; cells.pop("metadata"); display(cells)
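with the cells flattened we get our first taste of searching. a sketch, where "import" is an arbitrary query string:

XXX and cells[cells.source.str.contains("import", na=False)].source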
next we deal with outputs that include display_data, stdout, and stderr.
def get_outputs_from_cells(cells):
    outputs = cells["outputs cell_ct".split()].dropna(subset="outputs")
    outputs.outputs = outputs.outputs.apply(enumerate_list, key="output_ct", meta=meta.ANY)
    outputs = outputs.explode("outputs").dropna(subset="outputs")
    return dask.dataframe.concat([
        outputs.pop("outputs").apply(get_series, key="text", new=meta.new_output, meta=meta.OUTPUT),
        outputs
    ], axis=1)
if XXX:
    outputs = get_outputs_from_cells(cells)
    meta_display = outputs["metadata cell_ct output_ct".split()]; outputs.pop("metadata"); display(outputs)
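the same frame answers questions about failures. a small sketch, filtering down to the error outputs:

XXX and outputs[outputs.output_type == "error"]["ename evalue cell_ct".split()]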
separating the different standard out/error displays from the rich display data. there is probably more work to do managing the different types of outputs from the different reprs.
def get_display_data_from_outputs(outputs):
    display_data = outputs["data execution_count output_type cell_ct output_ct".split()].dropna(subset="data")
    display_data["data"] = display_data["data"].apply(compose_left(dict.items, list), meta=meta.ANY)
    display_data = display_data.explode("data").dropna(subset="data")
    return dask.dataframe.concat([
        display_data.pop("data").apply(
            compose_left(
                partial(zip, meta.new_display.index), dict,
                partial(get_series, key=None, new=meta.new_display)
            ), meta=meta.DISPLAY), display_data], axis=1)
XXX and (display_data := get_display_data_from_outputs(outputs)).compute()
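this is the payoff for the searching intent: display_data lets us grep every rich repr at once. a sketch, stringifying first since mime values can be lists or dicts, with "dask" as an arbitrary search term:

XXX and display_data[display_data.value.astype(str).str.contains("dask", na=False)].compute()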
where to go from here
- extend to other files. the notebook format is a hypermedia document format.
- save to different formats. initially we think about parquet, sketched after the samples below, while in theory from this dataframe we could go further and imagine it being the seed for documentation.
XXX and display(*(x.sample(frac=.1).compute().sample(5) for x in (cells, outputs, display_data)))
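a hedged sketch of the parquet hand-off. the object columns are stringified so pyarrow can serialize them, and "display_data.parquet" is a hypothetical output path:

XXX and display_data.astype(str).to_parquet("display_data.parquet")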
from dataclasses import dataclass, field

@dataclass
class Contents:
    dir: Path = field(default_factory=Path.cwd)
    contents: dask.dataframe.DataFrame = None

    def __post_init__(self):
        # build each frame from the last, all still lazy
        self.contents = get_contents_from_files(get_files(self.dir))
        self.cells = get_cells_from_contents(self.contents)
        self.outputs = get_outputs_from_cells(self.cells)
        self.display_data = get_display_data_from_outputs(self.outputs)
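a usage sketch: point Contents at a directory and the frames hang off the instance, lazy until computed.

if XXX:
    docs = Contents(WHERE)
    display(docs.display_data)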