revisit building the mast notebooks
combine the toc/execution work from the mast notebooks with site aggregation work.
in the example, we hot patch reprs to make them accessible.
# stdlib + third-party imports. tonyfast.utils presumably registers the pandas
# accessors used throughout this notebook (apath, enumerate, series, gather, ...)
# — verify against that module.
import tonyfast.utils, pandas, json, nbconvert, inspect, textwrap, nbclient, nbformat, operator, bs4, anyio, pathlib, re, os, traitlets
# nest_asyncio patches the running event loop so the top-level awaits below work
# inside an already-running jupyter loop
__import__("nest_asyncio").apply()
if MAIN := __name__ == "__main__":
    class Config:
        """shared configuration for this build"""
        # source notebook checkout and its html output directory
        dir = pathlib.Path("mast_notebooks").absolute()
        paths = ["mast_notebooks"]
        target = pathlib.Path("mast_out").absolute()
        # the "a11y" exporter is registered by nbconvert-a11y; prompts and
        # anchor links are stripped, settings/visibility widgets are included
        exporter = nbconvert.get_exporter("a11y")(
            exclude_input_prompt=True,
            include_sa11y=False,
            exclude_output_prompt=True,
            hide_anchor_links=True,
            include_settings=True,
            exclude_anchor_links=True,
            embed_images=True,
            validate_nb=False,
            include_visibility=True
        )
# truthy when running as a script (or when WRITE is set); notebooks only get a
# sample of the work when FILE is falsy.
# fix: this previously read globals().get("__file_") — a typo for "__file__" —
# which is never defined, so FILE silently fell back to the WRITE env var.
FILE = globals().get("__file__") or os.environ.get("WRITE")
async def from_notebook_node(nb, resources=None, exporter=Config.exporter):
    """asynchronously render a notebook node to its html body with the shared exporter."""
    body, _resources = exporter.from_notebook_node(nb, resources=resources)
    return body
load in all the notebooks
find all the notebooks resembling a post. we are skipping the work needing to be done on the indexes and readmes. indexes and readmes use different exporter configurations than content notebooks.
# load the jupyter-book table of contents and site config; apath/series are
# project accessors (presumably async pathlib wrappers — see tonyfast.utils)
toc = (
    await pandas.Index([Config.dir / "_toc.yml"], name="path").apath().apath.load()
).series()
config = (
    await pandas.Index([Config.dir / "_config.yml"], name="path").apath().apath.load()
).series().T.iloc[:,0]
# expand the toc hierarchy: parts -> chapters -> sections -> files
chapters = toc.parts.enumerate("chapter").series()
sections = chapters.chapters.enumerate("section").series()
files = sections.sections.dropna().enumerate("section").series().combine_first(
    sections[["file"]].set_index(pandas.Index([0]*len(sections), name="section"), append=True)
)
# NOTE(review): the three statements below repeat the block above with the
# inner index renamed "section" -> "subsection"; only the second pass sticks.
# the first pass looks like leftover exploration and could be removed.
chapters = toc.parts.enumerate("chapter").series()
sections = chapters.chapters.enumerate("section").series()
files = sections.sections.dropna().enumerate("subsection").series().combine_first(
    sections[["file"]].set_index(pandas.Index([0]*len(sections), name="subsection"), append=True)
)
# resolve the toc entries to real notebook paths and report anything missing
paths = ("mast_notebooks" / files.file.apath())
print(F"{(~paths.path().path.exists()).sum()} files missing")
paths = (await paths[await paths.apath().apath.exists()].apath.absolute()).pipe(pandas.Index)
df = paths.to_series()
# drop checkpoint copies and keep only .ipynb files as the frame's index
df = df.loc[~df.astype(str).str.contains("checkpoint")].pipe(pandas.Index).rename("file")
df = pandas.DataFrame(index=df[df.apath.suffix.eq(".ipynb")])
dependencies = await (await (
pandas.Index(["mast_notebooks/"]).apath().apath.rglob("requirements.txt")
)).pipe(pandas.Index).apath.read_text()
versions = dependencies.apply(str.splitlines).explode().str.extract(
"^(?P<package>[a-z|A-Z|_|-|0-9]+)\s*(?P<constraint>[\>|\<|=]*)?\s*(?P<version>\S*)?"
)
import yaml
from pathlib import Path
# write an environment.yml pinning the packages discovered in the requirements
unique_packages = versions.package.dropna().drop_duplicates().tolist()
# requirements name the "git" module; on pypi the distribution is GitPython
pip_packages = [{"git": "GitPython"}.get(package, package) for package in unique_packages]
Path("environment.yml").write_text(yaml.safe_dump(dict(
    name="mast_notebooks",
    channels=["conda-forge"],
    dependencies=["python=3.11", "pip", dict(
        pip=pip_packages + ["ipykernel", "astrocut", "lxml"]
    )]
)))
%%bash
# create the pinned conda env next to the notebooks and install the local
# nbconvert-a11y checkout into it
mamba env create -p.mast_nb -f environment.yml
mamba run -p.mast_nb --live-stream pip install -e../../../nbconvert-a11y
# mamba update -p.mast_nb -f environment.yml --force-reinstall
%%bash
# register the env as a jupyter kernel named mast_nb (used by nbclient below)
mamba run -p.mast_nb python -m ipykernel install --user --name mast_nb
# read every notebook file into an nbformat NotebookNode and join it onto df
df = (await df.index.apath.read_text()).apply(json.loads)\
    .rename("nb").apply(nbformat.from_dict).pipe(df.join)
if not FILE:
    # interactive runs only process a sample of notebooks
    df = df.head(10)
df
the notebooks require preparation before they can be transformed to html
# match cells that begin with the %%pidgy or %%midgy cell magic.
# fix: the original pattern "^\%\%[\s+,(pidgy),(midgy)]" used a character
# class, so it matched %% followed by ANY single character of that set
# (e.g. %%matplotlib, %%prun); an alternation is what was intended.
MIDGY = re.compile(r"^%%\s*(?:pidgy|midgy)\b")
def prepare_cell(cell):
    """make inplace changes to the notebook that carried through the publishing process"""
    # normalize multi-line sources (lists of strings) to a single string
    cell.source = "".join(cell.source)
    # hide the input of pidgy/midgy cells; their markdown renders instead
    if MIDGY.match(cell.source):
        cell.metadata.setdefault("jupyter", {})["source_hidden"] = True
    # normalize text-ish output payloads the same way
    for output in cell.get("outputs", ""):
        data = output.get("data", {})
        for key in data:
            if key.startswith("text"):
                data[key] = "".join(data[key])
        if "text" in output:
            output.text = "".join(output.text)
    return cell
# explode the notebooks into one row per cell and normalize each in place
cells = df.nb.itemgetter("cells").enumerate("index").apply(prepare_cell).series()
code = cells.loc[cells.cell_type.eq("code"), :]
# notebooks that load the pidgy/midgy extension are markdown-first: hide every
# code input in those notebooks
_idgy = code[code.source.str.contains("\s*%(?:re)load_ext\s+[pm]idgy")]
df.loc[_idgy.index.get_level_values(0).drop_duplicates()].apply(
    lambda x: [
        y["metadata"].setdefault("jupyter", {}).setdefault("source_hidden", True)
        for y in x.loc["nb"]["cells"] if y["cell_type"] == "code"
    ] and None, axis=1
);
def define_table_repr():
%load_ext nbconvert_a11y.outputs
%load_ext nbconvert_a11y.tables
import astropy.table
from nbconvert_a11y.outputs import BeautifulSoup
from nbconvert_a11y.tables import repr_semantic, get_table, SHOW_INDEX, repr_semantic_update
@repr_semantic.register(astropy.table.Table)
def repr_astropy_table(table, *args, **kwargs):
return get_table(obs.to_pandas(), BeautifulSoup(obs._base_repr_(True)).i.text, type_=type(table), SEMANTIC=False, ROW_INDEX=SHOW_INDEX.hide)
repr_semantic_update()
def inject_a11y(nb, LINES = "".join(inspect.getsourcelines(define_table_repr)[0][1:])):
    """insert the accessible-repr setup cell (the body of define_table_repr)
    as the second cell of *nb*, skipping notebooks that already carry it."""
    # fix: the original test `nb.cells[1] is not LINES` compared a cell object
    # against a string by identity, so it was always true and every call kept
    # inserting duplicate setup cells; compare the cell's source instead.
    # also guard notebooks with fewer than two cells.
    if not nb.cells[1:] or nb.cells[1].get("source") != LINES:
        nb.cells.insert(1, nbformat.v4.new_code_cell(LINES, metadata=dict(jupyter=dict(source_hidden=True))))
    return nb
# revalidate each notebook, inject the a11y setup cell, and execute on the
# mast_nb kernel; errors are allowed so partial output still renders
df.nb = df.nb.apply(nbformat.from_dict).apply(inject_a11y)
client = df.nb.apply(
    nbclient.NotebookClient, kernel_name="mast_nb", allow_errors=True
)
# NOTE(review): only the first 3 notebooks are executed (.head(3)); the rest
# pass through unexecuted via combine_first — confirm this is intentional.
df.nb = (
    await client.head(3).apply(nbclient.NotebookClient.async_execute).gather()
).combine_first(df.nb)
def render_markdown_output(output):
    """convert a text/markdown output to text/html in place.

    returns the rendered html when a conversion happened, otherwise None."""
    if "data" not in output:
        return None
    data = output["data"]
    if "text/markdown" not in data:
        return None
    rendered = Config.exporter.environment.globals["markdown"](data["text/markdown"])
    data["text/html"] = rendered
    return rendered
# render markdown outputs to html in place
outputs = cells.outputs.dropna().enumerate("output").dropna()
outputs.apply(render_markdown_output);
# render markdown cells to html, stashing the result in cell metadata under
# data/text-html; output index -1 marks cell-level (not output-level) html
markdowns = cells[cells.cell_type.eq("markdown")].apply(
    lambda s: operator.setitem(
        s.metadata.setdefault("data", {}),
        "text/html",
        html := Config.exporter.environment.filters["markdown2html"](dict(cell=s), s.source),
    )
    or html,  # setitem returns None, so the apply yields the rendered html
    axis=1,
).to_frame("html").assign(output=-1).set_index("output", append=True)
create intermediate representations of markdown. when we handle this work before templating we can use partial information from the outcome to build the table of contents and relative links.
# combine cell-level and output-level html into one frame and parse with bs4
html = pandas.concat(
    [
        markdowns,
        outputs.itemgetter("data").dropna().itemgetter("text/html").dropna().to_frame("html"),
    ]
).sort_index()
html["soup"] = html.html.apply(bs4.BeautifulSoup, features="lxml")
extract the headings from each cell
# pull every heading element out of each parsed cell
html["h"] = html.soup.methodcaller("select", "h1,h2,h3,h4,h5,h6")
h = html.h.enumerate("h").dropna()
# one row per heading with its level, text, and anchor id
h = h.to_frame("h").assign(
    level=h.attrgetter("name").str.lstrip("h").astype(int),
    # NOTE(review): "ยถ" looks like a mis-encoded pilcrow ("¶", the heading
    # anchor marker) — confirm the file's true encoding before touching it
    string=h.attrgetter("text").str.rstrip("ยถ"),
    id=h.attrgetter("attrs").itemgetter("id")
); h.head()
extract the document title from the headings. _we should probably extract a description too; adding a description to the page metadata is good for accessibility when choosing tabs._
h
# NOTE(review): this df.assign(...) result is never stored; it is repeated with
# assignment immediately below and could be deleted.
df.assign(title=h.groupby(h.index.get_level_values("file")).apply(
    lambda s: s.sort_values("level").string.iloc[0]
))
# title: the highest-level (smallest number) heading in each file
df = df.assign(title=h.groupby(h.index.get_level_values("file")).apply(
    lambda s: s.sort_values("level").string.iloc[0]
).rename("title"))
# description: the first paragraph of each file
df = df.assign(description=html.soup.methodcaller("select_one", "p").dropna().attrgetter("text").groupby(
    "file"
).apply(lambda x: x.sort_index().iloc[0]).rename("description").reindex(df.index))
# push title/description into each notebook's metadata for the exporter
df.apply(
    lambda x: (
        x.title and x.loc["nb"].metadata.setdefault("title", x.title),
        x.description and x.loc["nb"].metadata.setdefault("description", x.description)
    ), axis=1
);
make a table of contents details > nav > ol
for a dataframe
def make_toc(df):
    """build a nav > ol table-of-contents soup from a frame of headings.

    *df* rows are expected in document order with `level` (heading depth),
    `string` (heading text) and `id` (anchor target) columns.
    """
    toc = bs4.BeautifulSoup(features="lxml")
    toc.append(nav := toc.new_tag("nav"))
    nav.append(ol := toc.new_tag("ol"))
    last_level = 1
    for i, row in df.iterrows():
        if row.string:
            if row.level > last_level:
                # descend: open one nested <ol> per missing level
                for i in range(last_level, row.level):
                    last_level = i + 1
                    ol.append(li := toc.new_tag("li"))
                    li.append(ol := toc.new_tag("ol"))
            else:
                # ascend back toward the heading's level.
                # NOTE(review): `last_level` is not updated on this branch, so
                # after ascending (e.g. h3 -> h2) the next deeper heading may
                # nest relative to a stale level — confirm against real output.
                for i in range(row.level, last_level):
                    if i == 1:
                        continue
                    # each nesting step is li > ol, so two hops up per level
                    if ol.parent and ol.parent.parent:
                        ol = ol.parent.parent
            ol.append(li := toc.new_tag("li"))
            li.append( a:= toc.new_tag("a"))
            a.append(row.string)
            a.attrs.update(href=F"#{row.id}")
    return toc
generate the table of contents for each file we have indexed
# one table-of-contents html string per file
df = df.assign(toc=h.groupby(h.index.get_level_values("file")).apply(make_toc).apply(str))
determine the location of the html version of the file.
# map each source notebook to its html target path under Config.target
df = df.assign(target=(await (
    Config.target / df.index.apath.relative_to(Config.dir)
).apath().apath.with_suffix(".html").apath.absolute()).values)
# NOTE(review): the statement below is an exact duplicate of the one above
# and could be removed.
df = df.assign(target=(await (
    Config.target / df.index.apath.relative_to(Config.dir)
).apath().apath.with_suffix(".html").apath.absolute()).values)
# prev/next columns: the file index shifted one position in each direction
df = df.assign(**pandas.DataFrame([
    [None] + df.index.values[:-1].tolist(), df.index.values, df.index.values[1:].tolist() + [None]
], index=["prev", "file", "next"]).T.set_index("file"))
def relative_path(source, target):
    """compute the relative href from the file at *source* to the file at *target*.

    both arguments are file paths, so the result is relative to source's
    parent directory. returns None when target is falsy (no prev/next page).
    """
    if not target:
        return None
    if not source.is_absolute():
        source = pathlib.Path(source).absolute()
    if not target.is_absolute():
        target = pathlib.Path(target).absolute()
    # length of the shared prefix of the two paths.
    # fix: the original took `common` from enumerate(), which left it pointing
    # at the mismatch index even when one path exhausted without mismatching,
    # and counted the ".." climbs with len(target.parents) — wrong whenever the
    # two files sit at different depths. climbs must come from *source* depth.
    common = 0
    for s, t in zip(source.parts, target.parts):
        if s != t:
            break
        common += 1
    # len(source.parents) == number of directories above the source file
    ups = [".."] * (len(source.parents) - common)
    return type(source)(*ups, *target.parts[common:])
generate the footer that contains the previous and next links
# previous/next navigation links; hrefs are relative to the page being written.
# fix: the prev link's markup had broken quoting (rel="prev><span aria-hidden="
# true"="" plus a stray "<" before the title), producing invalid html; it now
# mirrors the structure of the next link.
df = df.assign(
    footer = df.apply(
        lambda s: (s.prev and F"""<a href="{relative_path(s.target, df.loc[s.prev].target)}" rel="prev"><span aria-hidden="true"><</span> {df.loc[s.prev].title}</a><br/>""" or "")
        + (s.next and F"""<a href="{relative_path(s.target, df.loc[s.next].target)}" rel="next">{df.loc[s.next].title} <span aria-hidden="true">></span></a><br/>""" or ""),
        axis=1
    )
)
# site-wide header: a details/summary navigation listing every page, with
# hrefs computed relative to the page being written
me = """<p>mast notebooks</p>"""
df = df.assign(
    header = df.apply(
        lambda s: me + "<details><summary>site navigation</summary><nav><ol>%s</ol></nav></details>"% "".join(
            F"""<li><a href="{relative_path(s.target, t.target)}">{t.title}</a></li>"""
            for i, t in df.iterrows()
        ), axis=1
    ))
# ensure the output directories exist, then render every notebook to html,
# passing the per-page toc/footer/header into the exporter resources
await df.target.apath.parent.drop_duplicates().apath.mkdir(exist_ok=True, parents=True);
df["html"] = await df[["nb"]].apply(
    lambda s: from_notebook_node(s["nb"], dict(toc=df.toc.loc[s.name], footer=df.loc[s.name].footer, header=df.loc[s.name].header)), axis=1).gather()
create an environment.yml file from the versions information previously collected
example outputs with accessible reprs
if 0 or FILE:
    # batch mode: write each rendered document to its computed target path
    await df.target.apath.parent.drop_duplicates().apath.mkdir(exist_ok=True, parents=True);
    await df.apply(
        lambda s: print(F"""writing {s.target.as_uri()}""") or s.target.write_text(str(s.loc["html"])), axis=1
    ).gather()
else:
    # interactive mode: preview the first couple of documents inline
    df.html.head(2).display.iframe().display()