Skip to content

where do all the good links go?¤

extracting links and definitions from markdown using markdown_it tokens

    def get_exporter(key="mkdocs", **kw):
        """Build a ``PidgyExporter`` whose template named *key* is the notebook-defined template.

        Args:
            key: name the template is registered under; also used as the
                default ``template_file`` when the caller does not pass one.
            **kw: forwarded to ``PidgyExporter``.

        Returns:
            The configured exporter.
        """
        from jinja2 import DictLoader

        # the template pieces live in a notebook, so import it through importnb
        with __import__("importnb").Notebook():
            from tonyfast.xxii.__markdownish_notebook import (
                HEAD,
                PidgyExporter,
                replace_attachments,
                template,
            )

        kw.setdefault("template_file", key)
        exporter = PidgyExporter(**kw)
        environment = exporter.environment
        # keep an "attachment" filter that is already registered
        environment.filters.setdefault("attachment", replace_attachments)

        # only the first DictLoader in the loader chain receives the templates
        dict_loader = next(
            (candidate for candidate in environment.loader.loaders if isinstance(candidate, DictLoader)),
            None,
        )
        if dict_loader is not None:
            dict_loader.mapping[key] = template
            dict_loader.mapping["HEAD"] = HEAD
        return exporter
    with __import__("importnb").Notebook():
        from tonyfast.xxiii.__duckdb_search import *
        from tonyfast.xxii.__markdownish_notebook import PidgyExporter, template   

    from midgy import Python
    import nbformat
    from markdown_it.tree import SyntaxTreeNode
    @dataclasses.dataclass
    class Finder:
        """Collect files under a directory and expose their stats as data frames.

        Iterating a ``Finder`` yields one stats dict per discovered file (see
        ``get_files_stats`` for the keys).
        """

        # root directory handed to ``iter_files``
        dir: str = ".."
        # newline separated patterns handed to ``iter_files`` as the include spec
        include: str = "*.ipynb\n*.md"
        # pattern handed to ``iter_files`` as the exclude spec
        exclude: str = ".ipynb_checkpoints"

        def get_files_stats(self, path):
            """Return a dict describing *path*: the path itself, suffix, timestamps and size.

            NOTE: ``created_at`` is ``st_ctime``, which on Unix is the time of the
            last metadata change rather than the creation time.
            """
            stat = path.stat()
            return dict(
                path=path,
                suffix=path.suffix,
                created_at=stat.st_ctime,
                modified_at=stat.st_mtime,
                size=stat.st_size,
            )

        def get_files(self) -> list[dict]:
            """Return the stats dict of every file ``iter_files`` yields for this finder."""
            return [
                self.get_files_stats(path)
                for path in iter_files(self.dir, self.include, self.exclude)
            ]

        def __iter__(self):
            yield from self.get_files()

        def to_frame(self, updated_from=None):
            """Return the file stats as a ``pandas.DataFrame`` indexed by path.

            Args:
                updated_from: optional frame from an earlier call; when given, only
                    rows whose ``modified_at`` differs from it are returned.

            Returns:
                A frame with the columns ``suffix``, ``created_at``, ``modified_at``
                and ``size``. When no files are found the frame is empty but keeps
                the same index name and columns.
            """
            # naming the columns keeps ``set_index("path")`` from raising a
            # KeyError when no files match; keys mirror ``get_files_stats``
            columns = ["path", "suffix", "created_at", "modified_at", "size"]
            df = pandas.DataFrame(list(self), columns=columns).set_index("path")
            if updated_from is not None:
                return df[df.modified_at.ne(updated_from.modified_at)]
            return df

        def to_dask(self):
            """Return the stats frame as a dask dataframe, requesting one partition per file."""
            from dask.dataframe import from_pandas

            df = self.to_frame()
            # never request zero partitions for an empty frame
            return from_pandas(df, npartitions=max(len(df), 1))
    # dtype hints for the four top-level notebook keys; not referenced again in the visible code
    order = {"cells": "O", "metadata": "O", "nbformat": int, "nbformat_minor": int}
    # one row per discovered file, indexed by its path
    (ddf := Finder().to_dask())
    # choose a reader for each file from its suffix
    ddf = ddf.assign(
        loader=ddf.suffix.apply(
            {".md": get_markdown_file, ".ipynb": nbformat.v4.reads}.get,
            meta=("loader", "O"),
        )
    )
    # row.name is the index label (the file path); read it and hand the text to the reader
    ddf = ddf.assign(
        data=ddf.apply(
            lambda row: row.loader(row.name.read_text()),
            axis=1,
            meta=("data", "O"),
        )
    )
    # export each loaded document and keep the first item of the exporter's result
    ddf = ddf.assign(
        md=ddf.data.apply(
            compose_left(get_exporter().from_notebook_node, first),
            meta=("md", "O"),
        )
    )
    # parse the exported markdown with midgy's Python parser
    ddf = ddf.assign(
        tokens=ddf.md.apply(Python().parse, meta=("tokens", "O"))
    )
    ddf
Dask DataFrame Structure:
suffix created_at modified_at size loader data md tokens
npartitions=50
../xxii/2022-11-12-async-import.ipynb object float64 float64 int64 object object object object
../xxii/2022-11-12-pluggy-experiments.ipynb ... ... ... ... ... ... ... ...
... ... ... ... ... ... ... ... ...
../xxiii/vendor/tree-sitter-python/bindings/rust/README.md ... ... ... ... ... ... ... ...
../xxiii/vendor/tree-sitter-python/bindings/rust/README.md ... ... ... ... ... ... ... ...
Dask Name: assign, 12 graph layers

how many tokens are there?¤

    # build a syntax tree per document, walk it, and explode to one row per node
    s = ddf.tokens.apply(
        lambda tokens: list(SyntaxTreeNode(tokens).walk()),
        meta=("token", "O"),
    ).explode()
    # count how often each node type occurs across all documents
    s.apply(lambda node: node.type, meta=("type", "O")).value_counts().compute()
text                  4663
html_inline           3739
inline                1023
paragraph              768
softbreak              615
code_inline            574
fence                  353
html_block             310
heading                242
code_block             204
list_item              186
link                   101
bullet_list             61
definition              50
root                    50
strong                  41
em                      16
blockquote              13
td                       9
ordered_list             8
image                    7
tr                       4
th                       3
dd                       2
footnote_reference       1
footnote_ref             1
dt                       1
table                    1
tbody                    1
dl                       1
thead                    1
Name: type, dtype: int64
    # keep only the nodes that carry a target: links, images and definitions
    links = s[
        s.apply(
            lambda node: node.type in {"link", "image", "definition"},
            meta=("link", bool),
        )
    ].compute()
    # merge each node's attrs and meta into a single record, one column per key
    links.apply(lambda node: pandas.Series(merge((node.attrs, node.meta))))
href title id url label src alt
path
../xxii/2022-11-12-async-import.ipynb https://gist.github.com/Rich-Harris/0b6f317657... NaN NaN NaN NaN NaN NaN
../xxii/2022-11-12-async-import.ipynb https://docs.python.org/3/library/ast.html#ast... NaN NaN NaN NaN NaN NaN
../xxii/2022-11-12-pluggy-experiments.ipynb https://pluggy.readthedocs.io/ NaN NaN NaN NaN NaN NaN
../xxii/2022-11-17-assignment-expression-display.ipynb https://peps.python.org/pep-0572/ NaN NaN NaN NaN NaN NaN
../xxii/2022-11-23-better-dask-shape.ipynb oct/2022-10-05-dask-search.ipynb NaN NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ...
../xxiii/vendor/tree-sitter-python/bindings/rust/README.md NaN LANGUAGE FUNC https://docs.rs/tree-sitter-python/*/tree_sitt... language func NaN NaN
../xxiii/vendor/tree-sitter-python/bindings/rust/README.md NaN PARSER https://docs.rs/tree-sitter/*/tree_sitter/stru... Parser NaN NaN
../xxiii/vendor/tree-sitter-python/bindings/rust/README.md NaN TREE-SITTER https://tree-sitter.github.io/ tree-sitter NaN NaN
../xxiii/vendor/tree-sitter-python/bindings/rust/README.md NaN TREE-SITTER CRATE https://crates.io/crates/tree-sitter tree-sitter crate NaN NaN
../xxiii/vendor/tree-sitter-python/bindings/rust/README.md NaN TREE-SITTER DISCUSSIONS https://github.com/tree-sitter/tree-sitter/dis... tree-sitter discussions NaN NaN

159 rows × 7 columns