where do all the good links go?¤
extracting links and definitions from markdown using markdown_it
tokens
def get_exporter(key="mkdocs", **kw):
    """Return a ``PidgyExporter`` configured with the project's markdown template.

    ``key`` names both the template file and the entry registered in the
    exporter's ``DictLoader``; extra keyword arguments pass through to the
    exporter constructor.
    """
    # import notebook-defined helpers lazily via importnb's loader context
    with __import__("importnb").Notebook():
        from tonyfast.xxii.__markdownish_notebook import template, HEAD, replace_attachments, PidgyExporter
    kw.setdefault("template_file", key)
    exporter = PidgyExporter(**kw)
    # expose attachment replacement inside jinja templates
    exporter.environment.filters.setdefault("attachment", replace_attachments)
    from jinja2 import DictLoader
    # register the in-memory template on the first DictLoader in the chain
    dict_loader = next(
        (candidate for candidate in exporter.environment.loader.loaders
         if isinstance(candidate, DictLoader)),
        None,
    )
    if dict_loader is not None:
        dict_loader.mapping[key] = template
        dict_loader.mapping["HEAD"] = HEAD
    return exporter
with __import__("importnb").Notebook():
from tonyfast.xxiii.__duckdb_search import *
from tonyfast.xxii.__markdownish_notebook import PidgyExporter, template
from midgy import Python
import nbformat
from markdown_it.tree import SyntaxTreeNode
@dataclasses.dataclass
class Finder:
    """Discover notebook/markdown files and expose their filesystem stats.

    Defaults search the parent directory for ``.ipynb``/``.md`` files while
    skipping checkpoint folders.
    """
    # search root
    dir: str = ".."
    # newline-separated glob patterns to include
    include: str = "*.ipynb\n*.md"
    # pattern to exclude
    exclude: str = ".ipynb_checkpoints"
    def get_files_stats(self, path):
        """Return one metadata record (path, suffix, times, size) for ``path``."""
        info = path.stat()
        return {
            "path": path,
            "suffix": path.suffix,
            "created_at": info.st_ctime,
            "modified_at": info.st_mtime,
            "size": info.st_size,
        }
    def get_files(self) -> list[dict]:
        """Collect stat records for every file matched by the include/exclude globs."""
        return [self.get_files_stats(found) for found in iter_files(self.dir, self.include, self.exclude)]
    def __iter__(self):
        return iter(self.get_files())
    def to_frame(self, updated_from=None):
        """Build a DataFrame indexed by path; optionally keep only rows whose
        modification time differs from ``updated_from``."""
        frame = pandas.DataFrame(self).set_index("path")
        if updated_from is None:
            return frame
        return frame[frame.modified_at != updated_from.modified_at]
    def to_dask(self):
        """Convert the stats frame to a dask dataframe, one partition per file."""
        from dask.dataframe import from_pandas
        frame = self.to_frame()
        return from_pandas(frame, npartitions=len(frame))
# dtype spec for notebook-shaped records (cells/metadata are objects, versions ints);
# NOTE(review): `order` is not referenced later in this view — presumably consumed downstream.
order = dict([("cells", "O"), ("metadata", "O"), ("nbformat", int), ("nbformat_minor", int)])
# build the lazy dask frame of discovered files; the walrus binds `ddf` while displaying it
(
    ddf := Finder().to_dask()
)
# pick a parser per suffix: markdown files via get_markdown_file, notebooks via nbformat
ddf = ddf.assign(loader=ddf.suffix.apply({".md": get_markdown_file, ".ipynb": nbformat.v4.reads}.get, meta=("loader", "O")))
# read each file's text (the frame index is the Path) and parse it into a notebook node
ddf = ddf.assign(
    data=ddf.apply(lambda s: s.loader(s.name.read_text()), axis=1, meta=("data", "O"))
)
# render each notebook node to markdown text (from_notebook_node returns (body, resources);
# `first` keeps only the body)
ddf = ddf.assign(md = ddf.data.apply(
    compose_left(get_exporter().from_notebook_node, first), meta=("md", "O")))
# tokenize the rendered markdown with midgy's parser
ddf = ddf.assign(tokens=ddf.md.apply(Python().parse, meta=("tokens", "O")))
# show the lazy frame
ddf
how many tokens are there?¤
# flatten every document's token stream into one long series of markdown-it
# SyntaxTreeNode objects (walk yields each node in the tree, explode un-nests the lists)
s = ddf.tokens.apply(compose_left(
    SyntaxTreeNode, operator.methodcaller("walk"), list
), meta=("token", "O")).explode()
# tally how often each token type occurs across all documents
s.apply(operator.attrgetter("type"), meta=("type", "O")).value_counts().compute()
all the links¤
# keep only the link, image, and definition tokens and materialize them
links = s[
    s.apply(compose_left(operator.attrgetter("type"), "link image definition".split().__contains__), meta=("link", bool))
].compute()
# merge each token's `attrs` and `meta` dicts and spread them into columns
links.apply(compose_left(operator.attrgetter("attrs", "meta"), merge, pandas.Series))