
build the site docs

building documentation with nbconvert and dataframes. this approach treats documents as data, using nbformat as the primary interface for multimedia content.

the following is an example rendering: https://tonyfast.github.io/tonyfast/draft/tonyfast/tonyfast/tonyfast/xxiii/2023-12-13-jupyter-community-call.html (something got wonky with the paths).

    import tonyfast.utils, pandas, json, nbconvert, nbformat, operator, bs4, anyio, pathlib, re, os
    __import__("nest_asyncio").apply()
    if MAIN := __name__ == "__main__":
        class Config:
            dir = pathlib.Path(globals().get("__file__") or "2024-02-21-build-docs-pd.ipynb").absolute().parent.parent.parent
            paths = ["tonyfast"]
            target = pathlib.Path("site/draft")
            exporter = nbconvert.get_exporter("a11y")(
                exclude_input_prompt=True, 
                include_sa11y=True,
                exclude_output_prompt=True,
                hide_anchor_links=True,
                include_settings=True,
                exclude_anchor_links=True,
                embed_images=True,
                validate_nb=False,
                include_visibility=True
            )
    FILE = globals().get("__file__") or os.environ.get("WRITE")
    async def from_notebook_node(nb, resources=None, exporter=Config.exporter):
        return exporter.from_notebook_node(nb, resources=resources)[0]
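
as a sanity check, the configured exporter converts a minimal notebook directly; the sketch below uses a hypothetical one-cell notebook and assumes the a11y exporter from nbconvert-a11y is installed. from_notebook_node above is just a thin async-friendly wrapper around the same call.

    # a minimal sketch: convert a throwaway one-cell notebook with the configured exporter
    _nb = nbformat.v4.new_notebook(cells=[nbformat.v4.new_markdown_cell("# hello world")])
    _body, _resources = Config.exporter.from_notebook_node(_nb)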

load in all the notebooks

find all the notebooks that resemble a post. we skip the work needed for the indexes and readmes for now; they use different exporter configurations than content notebooks.

    df = (
        Config.dir / pandas.Index(Config.paths)
    ).rename("directory").path(
    ).glob("**/[0-9][0-9][0-9][0-9]-*.ipynb", recursive=True).apath().pipe(
        pandas.Index, name="files"
    ).to_series()

remove checkpoint files.

    df = df.loc[~df.astype(str).str.contains("checkpoint")].pipe(pandas.Index).rename("file")

extract the date and slug from the file name. this can later be enriched with git information

    TITLE = "(?P<year>[0-9]{4})-(?P<month>[0-9]{2})-(?P<day>[0-9]{1,2})-(?P<slug>.+)"
    df = df.apath.stem.str.extract(TITLE).set_index(df).dropna(how="all")
    df["date"] = pandas.to_datetime(df.year +"-"+ df.month +"-"+ df.day)
    df = df.sort_values("date", ascending=False)
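
the pattern can be sanity checked against a hypothetical file stem:

    # sketch: TITLE captures the date parts and the slug from a post-like stem
    pandas.Index(["2024-02-21-build-docs-pd"]).str.extract(TITLE)
    #    year month day           slug
    # 0  2024    02  21  build-docs-pd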

read in all the notebooks

    df = (await df.index.apath.read_text()).apply(json.loads)\
    .rename("nb").apply(nbformat.from_dict).pipe(df.join)
    if not FILE:
        df = df.head(10)
    df
year month day slug date nb
file
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-15-screen-tests.ipynb 2024 03 15 screen-tests 2024-03-15 {'cells': [{'attachments': {}, 'cell_type': 'm...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-13-notebooks-for-all-march-call.ipynb 2024 03 13 notebooks-for-all-march-call 2024-03-13 {'cells': [{'cell_type': 'markdown', 'id': '11...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-12-nbconvert-a11y-pres.ipynb 2024 03 12 nbconvert-a11y-pres 2024-03-12 {'cells': [{'cell_type': 'markdown', 'id': '39...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-11-NOA.ipynb 2024 03 11 NOA 2024-03-11 {'cells': [{'attachments': {'8ea12e8f-45e6-4ea...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-06-mast-revisit.ipynb 2024 03 06 mast-revisit 2024-03-06 {'cells': [{'cell_type': 'markdown', 'id': '55...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-05-traceback-figure-tables.ipynb.ipynb 2024 03 05 traceback-figure-tables.ipynb 2024-03-05 {'cells': [{'cell_type': 'markdown', 'id': '93...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-05-astropy-table-repr.ipynb 2024 03 05 astropy-table-repr 2024-03-05 {'cells': [{'attachments': {'edb162e8-136d-437...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-04-semantic-outputs.ipynb 2024 03 04 semantic-outputs 2024-03-04 {'cells': [{'cell_type': 'markdown', 'id': '25...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-01-a11y-list-string.ipynb.ipynb 2024 03 01 a11y-list-string.ipynb 2024-03-01 {'cells': [{'attachments': {'931392ee-8c73-455...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-01-notebook-accessibility-workshop.ipynb 2024 03 01 notebook-accessibility-workshop 2024-03-01 {'cells': [{'cell_type': 'markdown', 'id': '34...

the notebooks require preparation before they can be transformed to html

    MIDGY = re.compile(r"^%%\s*[pm]idgy")  # cells that start with the pidgy/midgy cell magic

    def prepare_cell(cell):
        """make inplace changes to the notebook that carried through the publishing process"""
        cell.source = "".join(cell.source)
        if MIDGY.match(cell.source):
            cell.metadata.setdefault("jupyter", {})["source_hidden"] = True
        for out in cell.get("outputs", ""):
            for k, v in out.get("data", {}).items():
                if k.startswith("text"):
                    out["data"][k] = "".join(v)
            if "text" in out:
                out.text = "".join(out.text)
        return cell
    cells = df.nb.itemgetter("cells").enumerate("index").apply(prepare_cell).series()
    code = cells.loc[cells.cell_type.eq("code"), :]
    _idgy = code[code.source.str.contains(r"\s*%(?:re)?load_ext\s+[pm]idgy")]
    df.loc[_idgy.index.get_level_values(0).drop_duplicates()].apply(
        lambda x: [
            y["metadata"].setdefault("jupyter", {}).setdefault("source_hidden", True)
            for y in x.loc["nb"]["cells"] if y["cell_type"] == "code"
        ] and None, axis=1 
    );
    def render_markdown_output(output):
        if "data" in output:
            if "text/markdown" in output["data"]:
                md = Config.exporter.environment.globals["markdown"](output["data"]["text/markdown"])
                output["data"]["text/html"] = md
                return md
    outputs = cells.outputs.dropna().enumerate("output").dropna()
    outputs.apply(render_markdown_output);
    markdowns = cells[cells.cell_type.eq("markdown")].apply(
        lambda s: operator.setitem(
            s.metadata.setdefault("data", {}),
            "text/html",
            html := Config.exporter.environment.filters["markdown2html"](dict(cell=s), s.source),
        )
        or html,
        axis=1,
    ).to_frame("html").assign(output=-1).set_index("output", append=True)
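
for instance, prepare_cell applied to a hypothetical pidgy code cell joins the list source in place and hides the input:

    # sketch: prepare_cell on a hypothetical pidgy code cell
    _cell = nbformat.from_dict(dict(
        cell_type="code", metadata={}, execution_count=None, outputs=[],
        source=["%%pidgy\n", "some prose"],
    ))
    prepare_cell(_cell)
    assert _cell.source == "%%pidgy\nsome prose"
    assert _cell.metadata["jupyter"]["source_hidden"]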

create intermediate representations of the markdown. handling this work before templating lets us use partial information from the rendered html to build the table of contents and relative links.

    html = pandas.concat(
        [
            markdowns,
            outputs.itemgetter("data").dropna().itemgetter("text/html").dropna().to_frame("html"),

        ]
    ).sort_index()

    html["soup"] = html.html.apply(bs4.BeautifulSoup, features="lxml")

extract the headings from each cell

    html["h"] = html.soup.methodcaller("select", "h1,h2,h3,h4,h5,h6")
    h = html.h.enumerate("h").dropna()

expand the headings into features on the dataframe

    h = h.to_frame("h").assign(
        level=h.attrgetter("name").str.lstrip("h").astype(int),
        string=h.attrgetter("text").str.rstrip("¶"),
        id=h.attrgetter("attrs").itemgetter("id")
    ); h.head()
h level string id
file index output h
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-01-a11y-list-string.ipynb.ipynb 0 -1 0 [using microdata and semantic html to represen... 1 using microdata and semantic html to represent... None
4 0 0 [representing constants] 2 representing constants representing-constants
5 0 0 [representing strings] 2 representing strings representing-strings
6 0 0 [representing numbers] 2 representing numbers representing-numbers
7 -1 0 [representing containers] 2 representing containers None

extract the document title from the headings. _we should probably extract a description too; adding a description to the page metadata is good for accessibility when choosing tabs._

    h
h level string id
file index output h
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-01-a11y-list-string.ipynb.ipynb 0 -1 0 [using microdata and semantic html to represen... 1 using microdata and semantic html to represent... None
4 0 0 [representing constants] 2 representing constants representing-constants
5 0 0 [representing strings] 2 representing strings representing-strings
6 0 0 [representing numbers] 2 representing numbers representing-numbers
7 -1 0 [representing containers] 2 representing containers None
... ... ... ... ... ... ... ...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-13-notebooks-for-all-march-call.ipynb 0 -1 0 [notebooks for all community summary] 1 notebooks for all community summary None
2 -1 0 [full video] 2 full video None
4 -1 0 [clips] 2 clips None
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-15-screen-tests.ipynb 0 -1 0 [revisiting nbviewer/nbconvert screen reader e... 1 revisiting nbviewer/nbconvert screen reader ex... None
5 -1 0 [creating an accessible version of the document] 2 creating an accessible version of the document None

78 rows × 4 columns

    df.assign(title=h.groupby(h.index.get_level_values("file")).apply(
        lambda s: s.sort_values("level").string.iloc[0]
    ))
year month day slug date nb title
file
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-15-screen-tests.ipynb 2024 03 15 screen-tests 2024-03-15 {'cells': [{'attachments': {}, 'cell_type': 'm... revisiting nbviewer/nbconvert screen reader ex...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-13-notebooks-for-all-march-call.ipynb 2024 03 13 notebooks-for-all-march-call 2024-03-13 {'cells': [{'cell_type': 'markdown', 'id': '11... notebooks for all community summary
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-12-nbconvert-a11y-pres.ipynb 2024 03 12 nbconvert-a11y-pres 2024-03-12 {'cells': [{'cell_type': 'markdown', 'id': '39... semantically meaningful notebooks
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-11-NOA.ipynb 2024 03 11 NOA 2024-03-11 {'cells': [{'attachments': {'8ea12e8f-45e6-4ea... Astronomy Notebooks for All
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-06-mast-revisit.ipynb 2024 03 06 mast-revisit 2024-03-06 {'cells': [{'cell_type': 'markdown', 'id': '55... revisit building the mast notebooks
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-05-traceback-figure-tables.ipynb.ipynb 2024 03 05 traceback-figure-tables.ipynb 2024-03-05 {'cells': [{'cell_type': 'markdown', 'id': '93... using tables to structure tracebacks
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-05-astropy-table-repr.ipynb 2024 03 05 astropy-table-repr 2024-03-05 {'cells': [{'attachments': {'edb162e8-136d-437... astropy tables as semantic tables
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-04-semantic-outputs.ipynb 2024 03 04 semantic-outputs 2024-03-04 {'cells': [{'cell_type': 'markdown', 'id': '25... improved output semantic for python objects
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-01-a11y-list-string.ipynb.ipynb 2024 03 01 a11y-list-string.ipynb 2024-03-01 {'cells': [{'attachments': {'931392ee-8c73-455... using microdata and semantic html to represent...
/home/tbone/Documents/tonyfast/tonyfast/xxiv/2024-03-01-notebook-accessibility-workshop.ipynb 2024 03 01 notebook-accessibility-workshop 2024-03-01 {'cells': [{'cell_type': 'markdown', 'id': '34... notebooks and accessibility workshop
    df = df.assign(title=h.groupby(h.index.get_level_values("file")).apply(
        lambda s: s.sort_values("level").string.iloc[0]
    ).rename("title"))
    df = df.assign(description=html.soup.methodcaller("select_one", "p").dropna().attrgetter("text").groupby(
        "file"
    ).apply(lambda x: x.sort_index().iloc[0]).rename("description").reindex(df.index))
    df.apply(
        lambda x: (
            x.title and x.loc["nb"].metadata.setdefault("title", x.title),
            x.description and x.loc["nb"].metadata.setdefault("description", x.description)
        ), axis=1
    );

make a table of contents (details > nav > ol) for a dataframe

    def make_toc(df):        
        toc = bs4.BeautifulSoup(features="lxml")
        toc.append(nav := toc.new_tag("nav"))
        nav.append(ol := toc.new_tag("ol"))
        last_level = 1
        for i, row in df.iterrows():
            if row.string:
                if row.level > last_level:
                    for i in range(last_level, row.level):
                        last_level = i + 1
                        ol.append(li := toc.new_tag("li"))
                        li.append(ol := toc.new_tag("ol"))
                else:
                    for i in range(row.level, last_level):
                        if i == 1:
                            continue
                        if ol.parent and ol.parent.parent:
                            ol = ol.parent.parent
                ol.append(li := toc.new_tag("li"))
                li.append( a:= toc.new_tag("a"))
                a.append(row.string)
                a.attrs.update(href=F"#{row.id}")
        return toc
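
a quick sketch of what make_toc produces for a hypothetical pair of headings, nested ordered lists inside a nav:

    # sketch: make_toc nests an ol per heading level
    _h = pandas.DataFrame([
        dict(level=1, string="build the site docs", id="build-the-site-docs"),
        dict(level=2, string="load in all the notebooks", id="load-in-all-the-notebooks"),
    ])
    print(make_toc(_h))
    # roughly: <nav><ol><li><a href="#build-the-site-docs">build the site docs</a></li>
    #          <li><ol><li><a href="#load-in-all-the-notebooks">load in all the notebooks</a></li></ol></li></ol></nav>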

generate the table of contents for each file we have indexed

    df = df.assign(toc=h.groupby(h.index.get_level_values("file")).apply(make_toc).apply(str))

determine the location of the html version of the file.

    df = df.assign(target=(await (
        Config.target / df.index.apath.relative_to(Config.dir)
    ).apath().apath.with_suffix(".html").apath.absolute()).values)
    df = df.assign(**pandas.DataFrame([
            [None] + df.index.values[:-1].tolist(), df.index.values, df.index.values[1:].tolist() + [None]
    ], index=["prev", "file", "next"]).T.set_index("file"))
    def relative_path(source, target):
        """compute a relative path from source to target"""
        if target:
            common = []
            if not source.is_absolute():
                source = pathlib.Path(source).absolute()
            if not target.is_absolute():
                target = pathlib.Path(target).absolute()
            for common, (s, t) in enumerate(zip(source.parts, target.parts)):
                if s != t: break
            return type(source)(*[".."]*(len(target.parents)-common), *target.parts[common:])
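
for example, with a hypothetical pair of generated pages the helper climbs out of the source directory and back down to the target:

    # sketch: relative link between two hypothetical generated pages
    relative_path(
        pathlib.Path("/site/draft/tonyfast/xxiv/2024-02-21-build-docs-pd.html"),
        pathlib.Path("/site/draft/tonyfast/xxiii/2023-12-13-jupyter-community-call.html"),
    )  # PosixPath('../xxiii/2023-12-13-jupyter-community-call.html') on posix systems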

generate the footer that contains the previous and next links

    df = df.assign(
        footer = df.apply(
                lambda s: (s.prev and F"""<a href="{relative_path(s.target, df.loc[s.prev].target)}" rel="prev"><span aria-hidden="true">&lt;</span> {df.loc[s.prev].title}</a><br/>""" or "")
                + (s.next and F"""<a href="{relative_path(s.target, df.loc[s.next].target)}" rel="next">{df.loc[s.next].title} <span aria-hidden="true">&gt;</span></a><br/>""" or ""),
                axis=1
        )
    )
    me = """<p><a aria-description="opens new tab" href="https://github.com/tonyfast" rel="me" style="font-size: bigger;" target="_blank">@tonyfast</a>s notebooks</p>"""
    df = df.assign(
        header = df.apply(
        lambda s: me + "<details><summary>site navigation</summary><nav><ol>%s</ol></nav></details>"% "".join(
            F"""<li><a href="{relative_path(s.target, t.target)}">{t.title}</a></li>"""
            for i, t in df.iterrows()
        ), axis=1
    ))
    await df.target.apath.parent.drop_duplicates().apath.mkdir(exist_ok=True, parents=True);
    df["html"] = await df[["nb"]].apply(
        lambda s: from_notebook_node(s["nb"], dict(toc=df.toc.loc[s.name], footer=df.loc[s.name].footer, header=df.loc[s.name].header)), axis=1).gather()
    if 0 or FILE:
        await df.target.apath.parent.drop_duplicates().apath.mkdir(exist_ok=True, parents=True);
        await df.apply(
            lambda s: print(F"""writing {s.target.as_uri()}""") or s.target.write_text(str(s.loc["html"])), axis=1
        ).gather()
    else:
        df.html.head(2).display.iframe().display()
    readmes = df.groupby(df.target.apath.parent / "index.html").apply(
        (top_toc := lambda x: x.apply(
          lambda y: F"""<li><a href="{relative_path(y.target.parent / "index.html", y.target)}">{y.loc["title"]}</a></li>""", axis=1
        ).pipe(lambda df: "<nav><ul>{}</ul></nav>".format("".join(df.values))))
    ).apply(nbformat.v4.new_markdown_cell).apply(lambda x: nbformat.v4.new_notebook(cells=[x])).to_frame("nb")
    readmes = readmes.assign(target=Config.target / readmes.index.apath.relative_to(Config.dir))
    await readmes.target.apath().apath.parent.drop_duplicates().apath.mkdir(exist_ok=True, parents=True);
    content = readmes.nb.apply(
        from_notebook_node, exporter=(
            toc_exporter := nbconvert.get_exporter("a11y")(
                exclude_input_prompt=True, 
                include_sa11y=False,
                exclude_output_prompt=True,
                exclude_anchor_links=True,
                include_toc=False,
                include_summary=False,
                table_pattern="Region"
            )
        )
    ).gather()
    primary = pandas.Series([nbformat.v4.new_notebook(cells=[nbformat.v4.new_markdown_cell(df.pipe(top_toc))])], index=[Config.target/"index.html"]).apply(
        toc_exporter.from_notebook_node
    ).apply(list).series()[0]

    if 0 or FILE:
        (await content).to_frame("html").apply(
            lambda x: print(F"write index {readmes.loc[x.name].target}") or readmes.loc[x.name].target.write_text(str(x.loc["html"])), axis=1
        )
        primary.to_frame("html").apply(
            lambda x: print(F"write index {x.name}") or x.name.write_text(str(x.loc["html"])), axis=1
        )
    else:
        (await content).display.iframe().display()
        primary.display.iframe().display()