
revisit building the mast notebooks

combine the toc/execution work from the mast notebooks with the site aggregation work.

in the example, we hot patch reprs to make them accessible.
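
the general pattern looks like the sketch below: register a replacement html repr with IPython's display formatter. the real work later in this notebook goes through nbconvert_a11y's repr_semantic registry instead.

    import pandas

    def accessible_frame_repr(df):
        # return whichever accessible markup should stand in for the default repr
        return df.to_html(border=0)

    # get_ipython is only defined inside a running IPython/Jupyter session
    get_ipython().display_formatter.formatters["text/html"].for_type(
        pandas.DataFrame, accessible_frame_repr
    )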

    import tonyfast.utils, pandas, json, nbconvert, inspect, textwrap, nbclient, nbformat, operator, bs4, anyio, pathlib, re, os, traitlets
    __import__("nest_asyncio").apply()
    if MAIN := __name__ == "__main__":
        class Config:
            dir = pathlib.Path("mast_notebooks").absolute()
            paths = ["mast_notebooks"]
            target = pathlib.Path("mast_out").absolute()
            exporter = nbconvert.get_exporter("a11y")(
                exclude_input_prompt=True, 
                include_sa11y=False,
                exclude_output_prompt=True,
                hide_anchor_links=True,
                include_settings=True,
                exclude_anchor_links=True,
                embed_images=True,
                validate_nb=False,
                include_visibility=True
            )
    FILE = globals().get("__file__") or os.environ.get("WRITE")
    async def from_notebook_node(nb, resources=None, exporter=Config.exporter):
        return exporter.from_notebook_node(nb, resources=resources)[0]

load in all the notebooks

find all the notebooks resembling a post. we skip the indexes and readmes for now; they need different exporter configurations than the content notebooks.
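
the flattening below assumes the jupyter book toc shape: parts hold chapters, chapters may hold sections, and any node can point at a notebook through a file key. a hypothetical _toc.yml, loaded the plain way:

    import yaml

    # a made-up _toc.yml in the shape the flattening expects
    toc_sample = yaml.safe_load("""
    format: jb-book
    root: index
    parts:
    - caption: astroquery
      chapters:
      - file: notebooks/astroquery/beginner_search/beginner_search
        sections:
        - file: notebooks/astroquery/beginner_zcut/beginner_zcut
    """)
    toc_sample["parts"][0]["chapters"][0]["sections"][0]["file"]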

    toc = (
        await pandas.Index([Config.dir / "_toc.yml"], name="path").apath().apath.load()
    ).series()

    config = (
        await pandas.Index([Config.dir / "_config.yml"], name="path").apath().apath.load()
    ).series().T.iloc[:,0]
    chapters = toc.parts.enumerate("chapter").series()
    sections = chapters.chapters.enumerate("section").series()   
    files = sections.sections.dropna().enumerate("subsection").series().combine_first(
        sections[["file"]].set_index(pandas.Index([0]*len(sections), name="subsection"), append=True)
    )
    paths = ("mast_notebooks" / files.file.apath())
    print(F"{(~paths.path().path.exists()).sum()} files missing")
    paths = (await paths[await paths.apath().apath.exists()].apath.absolute()).pipe(pandas.Index)
4 files missing
    df = paths.to_series()
    df = df.loc[~df.astype(str).str.contains("checkpoint")].pipe(pandas.Index).rename("file")
    df = pandas.DataFrame(index=df[df.apath.suffix.eq(".ipynb")])
    dependencies = await (await (
        pandas.Index(["mast_notebooks/"]).apath().apath.rglob("requirements.txt")
    )).pipe(pandas.Index).apath.read_text()
    versions = dependencies.apply(str.splitlines).explode().str.extract(
        r"^(?P<package>[A-Za-z0-9_-]+)\s*(?P<constraint>[><=]*)?\s*(?P<version>\S*)?"
    )
    import yaml; from pathlib import Path
    deps = versions.package.dropna().drop_duplicates().tolist()
    deps = [{"git": "GitPython"}.get(x,x) for x in deps ]
    Path("environment.yml").write_text(yaml.safe_dump(dict(
        name="mast_notebooks",
        channels=["conda-forge"],
        dependencies=["python=3.11", "pip", dict(
            pip=deps+ ["ipykernel", "astrocut", "lxml"]
        )]
    )))
428
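
as a spot check, the pin pattern splits typical requirement lines into package, constraint, and version; the lines here are made up:

    import re

    PIN = re.compile(r"^(?P<package>[A-Za-z0-9_-]+)\s*(?P<constraint>[><=]*)\s*(?P<version>\S*)")
    for line in ("astropy>=5.0", "numpy", "astroquery==0.4.6"):
        print(PIN.match(line).groupdict())
    # {'package': 'astropy', 'constraint': '>=', 'version': '5.0'} ...
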
%%bash
mamba env create -p .mast_nb -f environment.yml
mamba run -p .mast_nb --live-stream pip install -e ../../../nbconvert-a11y
# mamba update -p .mast_nb -f environment.yml --force-reinstall

%%bash
mamba run -p .mast_nb python -m ipykernel install --user --name mast_nb
    df = (await df.index.apath.read_text()).apply(json.loads)\
    .rename("nb").apply(nbformat.from_dict).pipe(df.join)
    if not FILE:
        df = df.head(10)
    df
the first ten rows, indexed by notebook path under /home/tbone/Documents/tonyfast/tonyfast/xxiv/, each carrying the parsed notebook in the nb column:

mast_notebooks/notebooks/astrocut/making_tess_cubes_and_cutouts/making_tess_cubes_and_cutouts.ipynb
mast_notebooks/notebooks/astroquery/beginner_search/beginner_search.ipynb
mast_notebooks/notebooks/astroquery/beginner_zcut/beginner_zcut.ipynb
mast_notebooks/notebooks/astroquery/large_downloads/large_downloads.ipynb
mast_notebooks/notebooks/astroquery/historic_quasar_observations/historic_quasar_observations.ipynb
mast_notebooks/notebooks/astroquery/wildcard_searches/wildcard_searches.ipynb
mast_notebooks/notebooks/HSC/HCV_API/HCV_API_demo.ipynb
mast_notebooks/notebooks/HSC/HSCV3_API/hscv3_api.ipynb
mast_notebooks/notebooks/HSC/HSCV3_SMC_API/hscv3_smc_api.ipynb
mast_notebooks/notebooks/HSC/HSC_TAP/HSC_TAP.ipynb

the notebooks require preparation before they can be transformed to html

    # cells beginning with the pidgy/midgy cell magics
    MIDGY = re.compile(r"^%%\s*[pm]idgy")

    def prepare_cell(cell):
        """make in-place changes to the notebook that carry through the publishing process"""
        cell.source = "".join(cell.source)
        if MIDGY.match(cell.source):
            cell.metadata.setdefault("jupyter", {})["source_hidden"] = True
        for out in cell.get("outputs", ""):
            # join list-of-lines payloads into single strings
            for k, v in out.get("data", {}).items():
                if k.startswith("text"):
                    out["data"][k] = "".join(v)
            if "text" in out:
                out.text = "".join(out.text)
        return cell
    cells = df.nb.itemgetter("cells").enumerate("index").apply(prepare_cell).series()
    code = cells.loc[cells.cell_type.eq("code"), :]
    _idgy = code[code.source.str.contains(r"\s*%(?:re)?load_ext\s+[pm]idgy")]
    df.loc[_idgy.index.get_level_values(0).drop_duplicates()].apply(
        lambda x: [
            y["metadata"].setdefault("jupyter", {}).setdefault("source_hidden", True)
            for y in x.loc["nb"]["cells"] if y["cell_type"] == "code"
        ] and None, axis=1 
    );
    def define_table_repr():
        %load_ext nbconvert_a11y.outputs
        %load_ext nbconvert_a11y.tables
        import astropy.table
        from nbconvert_a11y.outputs import BeautifulSoup
        from nbconvert_a11y.tables import repr_semantic, get_table, SHOW_INDEX, repr_semantic_update
        @repr_semantic.register(astropy.table.Table)
        def repr_astropy_table(table, *args, **kwargs):
            return get_table(table.to_pandas(), BeautifulSoup(table._base_repr_(True)).i.text, type_=type(table), SEMANTIC=False, ROW_INDEX=SHOW_INDEX.hide)

        repr_semantic_update()
    def inject_a11y(nb, LINES = "".join(inspect.getsourcelines(define_table_repr)[0][1:])):
        if nb.cells[1].source != LINES:
            nb.cells.insert(1, nbformat.v4.new_code_cell(LINES, metadata=dict(jupyter=dict(source_hidden=True))))
        return nb
    df.nb = df.nb.apply(nbformat.from_dict).apply(inject_a11y)
    client = df.nb.apply(
        nbclient.NotebookClient, kernel_name="mast_nb", allow_errors=True
    )

    df.nb = (
        await client.head(3).apply(nbclient.NotebookClient.async_execute).gather()
    ).combine_first(df.nb)
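
each row in that pipeline amounts to the plain nbclient call sketched here, without the pandas vectorization:

    import nbclient, nbformat

    async def execute_one(path, kernel_name="mast_nb"):
        # read, execute, and return a single notebook
        nb = nbformat.read(path, as_version=4)
        client = nbclient.NotebookClient(nb, kernel_name=kernel_name, allow_errors=True)
        return await client.async_execute()
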
    def render_markdown_output(output):
        if "data" in output:
            if "text/markdown" in output["data"]:
                md = Config.exporter.environment.globals["markdown"](output["data"]["text/markdown"])
                output["data"]["text/html"] = md
                return md
    outputs = cells.outputs.dropna().enumerate("output").dropna()
    outputs.apply(render_markdown_output);
    markdowns = cells[cells.cell_type.eq("markdown")].apply(
        lambda s: operator.setitem(
            s.metadata.setdefault("data", {}),
            "text/html",
            html := Config.exporter.environment.filters["markdown2html"](dict(cell=s), s.source),
        )
        or html,
        axis=1,
    ).to_frame("html").assign(output=-1).set_index("output", append=True)

create intermediate representations of the markdown. handling this work before templating lets us use partial information from the outcome to build the table of contents and relative links.

    html = pandas.concat(
        [
            markdowns,
            outputs.itemgetter("data").dropna().itemgetter("text/html").dropna().to_frame("html"),
        ]
    ).sort_index()

    html["soup"] = html.html.apply(bs4.BeautifulSoup, features="lxml")

extract the headings from each cell

    html["h"] = html.soup.methodcaller("select", "h1,h2,h3,h4,h5,h6")
    h = html.h.enumerate("h").dropna()

expand the headings into features on the dataframe

    h = h.to_frame("h").assign(
        level=h.attrgetter("name").str.lstrip("h").astype(int),
        string=h.attrgetter("text").str.rstrip("¶"),
        id=h.attrgetter("attrs").itemgetter("id")
    ); h.head()
the first rows of h, here for HCV_API_demo.ipynb, indexed by (file, index, output, h) and pairing each heading tag with its level, string, and id:

level  string                                              id
1      Hubble Catalog of Variables Notebook (API vers...   None
3      2019 - 2022, Rick White, Steve Lubow, Trenton ...   None
1      Instructions                                        None
1      Table of Contents                                   None
1      Initialization                                      None

extract the document title from the headings. we should probably extract a description too; adding a description to the metadata is good for accessibility when choosing among tabs.
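
the intent is for the title and description to land in the exported head, roughly as sketched here with bs4; the real markup comes from the a11y template, and the description text below is illustrative:

    import bs4

    head = bs4.BeautifulSoup("<head></head>", features="lxml")
    head.head.append(title := head.new_tag("title"))
    title.append("Beginner: Searching MAST using astroquery.mast")
    head.head.append(meta := head.new_tag("meta"))
    # a made-up description; ours comes from the notebook's first paragraph
    meta.attrs.update(name="description", content="search MAST holdings with astroquery.mast")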

    h
the same four columns over every indexed file; the head repeats the HCV_API_demo.ipynb rows above, and the tail covers wildcard_searches.ipynb (Case 2: Wildcard Search with instrument_name..., Case 3: Create a Moving Target Ephemeris usin..., Resources, Citations, About This Notebook)

155 rows × 4 columns

    df.assign(title=h.groupby(h.index.get_level_values("file")).apply(
        lambda s: s.sort_values("level").string.iloc[0]
    ))
each notebook (basename shown) paired with its extracted title:

making_tess_cubes_and_cutouts.ipynb  Generating Cubes and Cutouts from TESS FFIs
beginner_search.ipynb                Beginner: Searching MAST using astroquery.mast
beginner_zcut.ipynb                  Beginner: Zcut and Astroquery Tutorial
large_downloads.ipynb                Large Downloads in astroquery.mast
historic_quasar_observations.ipynb   Historical Quasar Observations
wildcard_searches.ipynb              Wildcard Handling with Astroquery.mast
HCV_API_demo.ipynb                   Hubble Catalog of Variables Notebook (API vers...
hscv3_api.ipynb                      Hubble Source Catalog API Notebook
hscv3_smc_api.ipynb                  Hubble Source Catalog API Notebook: SMC Color-...
HSC_TAP.ipynb                        MAST Table Access Protocol Hubble Source Catal...
    df = df.assign(title=h.groupby(h.index.get_level_values("file")).apply(
        lambda s: s.sort_values("level").string.iloc[0]
    ).rename("title"))
    df = df.assign(description=html.soup.methodcaller("select_one", "p").dropna().attrgetter("text").groupby(
        "file"
    ).apply(lambda x: x.sort_index().iloc[0]).rename("description").reindex(df.index))
    df.apply(
        lambda x: (
            x.title and x.loc["nb"].metadata.setdefault("title", x.title),
            x.description and x.loc["nb"].metadata.setdefault("description", x.description)
        ), axis=1
    );

make a table of contents (details > nav > ol) for a dataframe

    def make_toc(df):
        toc = bs4.BeautifulSoup(features="lxml")
        toc.append(nav := toc.new_tag("nav"))
        nav.append(ol := toc.new_tag("ol"))
        last_level = 1
        for _, row in df.iterrows():
            if row.string:
                if row.level > last_level:
                    # descend: open a nested list for every level skipped
                    for level in range(last_level, row.level):
                        last_level = level + 1
                        ol.append(li := toc.new_tag("li"))
                        li.append(ol := toc.new_tag("ol"))
                else:
                    # climb back out to this heading's level, never past the root ol
                    for _ in range(row.level, last_level):
                        if ol.parent and ol.parent.parent and ol.parent.parent.name == "ol":
                            ol = ol.parent.parent
                    last_level = row.level
                ol.append(li := toc.new_tag("li"))
                li.append(a := toc.new_tag("a"))
                a.append(row.string)
                a.attrs.update(href=F"#{row.id}")
        return toc
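
a quick shape check with a couple of made-up headings:

    sample = pandas.DataFrame([
        dict(level=1, string="Introduction", id="introduction"),
        dict(level=2, string="Setup", id="setup"),
    ])
    print(make_toc(sample))
    # <nav><ol><li><a href="#introduction">Introduction</a></li>
    # <li><ol><li><a href="#setup">Setup</a></li></ol></li></ol></nav>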

generate the table of contents for each file we have indexed

    df = df.assign(toc=h.groupby(h.index.get_level_values("file")).apply(make_toc).apply(str))

determine the location of the html version of the file.

    df = df.assign(target=(await (
        Config.target / df.index.apath.relative_to(Config.dir)
    ).apath().apath.with_suffix(".html").apath.absolute()).values)
    df = df.assign(**pandas.DataFrame([
            [None] + df.index.values[:-1].tolist(), df.index.values, df.index.values[1:].tolist() + [None]
    ], index=["prev", "file", "next"]).T.set_index("file"))
    def relative_path(source, target):
        """compute a relative path from source to target"""
        if target:
            common = 0
            if not source.is_absolute():
                source = pathlib.Path(source).absolute()
            if not target.is_absolute():
                target = pathlib.Path(target).absolute()
            for common, (s, t) in enumerate(zip(source.parts, target.parts)):
                if s != t: break
            else:
                common += 1
            # climb out of source's directories past the shared prefix, then down to target
            return type(source)(*[".."]*(len(source.parents)-common), *target.parts[common:])
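
spot checks with made-up locations:

    print(relative_path(pathlib.Path("/site/a/one.html"), pathlib.Path("/site/a/two.html")))    # two.html
    print(relative_path(pathlib.Path("/site/a/b/one.html"), pathlib.Path("/site/c/two.html")))  # ../../c/two.html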

generate the footer that contains the previous and next links

    df = df.assign(
        footer = df.apply(
                lambda s: (s.prev and F"""<a href="{relative_path(s.target, df.loc[s.prev].target)}" rel="prev"><span aria-hidden="true">&lt;</span> {df.loc[s.prev].title}</a><br/>""" or "")
                + (s.next and F"""<a href="{relative_path(s.target, df.loc[s.next].target)}" rel="next">{df.loc[s.next].title} <span aria-hidden="true">&gt;</span></a><br/>""" or ""),
                axis=1
        )
    )
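
for the beginner_zcut notebook the footer renders roughly as (targets per the mast_out tree):

    <a href="../beginner_search/beginner_search.html" rel="prev"><span aria-hidden="true">&lt;</span> Beginner: Searching MAST using astroquery.mast</a><br/>
    <a href="../large_downloads/large_downloads.html" rel="next">Large Downloads in astroquery.mast <span aria-hidden="true">&gt;</span></a><br/>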
    me = """<p>mast notebooks</p>"""
    df = df.assign(
        header = df.apply(
        lambda s: me + "<details><summary>site navigation</summary><nav><ol>%s</ol></nav></details>"% "".join(
            F"""<li><a href="{relative_path(s.target, t.target)}">{t.title}</a></li>"""
            for i, t in df.iterrows()
        ), axis=1
    ))
    await df.target.apath.parent.drop_duplicates().apath.mkdir(exist_ok=True, parents=True);
    df["html"] = await df[["nb"]].apply(
        lambda s: from_notebook_node(s["nb"], dict(toc=df.toc.loc[s.name], footer=df.loc[s.name].footer, header=df.loc[s.name].header)), axis=1).gather()

earlier we created an environment.yml file from the version information collected from the requirements files.

example outputs with accessible reprs

    if 0 or FILE:
        await df.target.apath.parent.drop_duplicates().apath.mkdir(exist_ok=True, parents=True);
        await df.apply(
            lambda s: print(F"""writing {s.target.as_uri()}""") or s.target.write_text(str(s.loc["html"])), axis=1
        ).gather()
    else:
        df.html.head(2).display.iframe().display()