full text search for notebooks and files using duckdb

duckdb is great for moderate sized data. maybe it would be good for searching notebooks. i know pandas so we are going to use pandas to load in our data

reads files
load contents in the nbformat
create the table in an in-memory duckdb database
add a full text search index on the columns
search the source

    import pandas, duckdb, functools

`search` is our database goal

the use of the search is demonstrated at the end of the document

    def search(q) -> pandas.DataFrame:
        """Run a BM25 full text search for `q` over the `source` column of `cells`.

        Returns a DataFrame holding every column of `cells` plus a `score`
        column, limited to the rows that matched (non-NULL score) and ordered
        with the highest score first.
        """
        # the query ends up inside a single-quoted SQL literal, so an embedded
        # quote must be doubled; otherwise a query such as "don't" would end the
        # literal early and the remainder would be parsed as SQL.
        literal = str(q).replace("'", "''")
        return (get_db().execute(f"""
        SELECT * FROM
        (
            SELECT *, fts_main_cells.match_bm25(path, '{literal}', fields:='source') AS score FROM cells
        )
        WHERE score IS NOT NULL
        ORDER BY score DESC;
        """)).df()

https://duckdb.org/docs/extensions/full_text_search

    @functools.lru_cache # this makes our function a singleton
    def get_db() -> duckdb.DuckDBPyConnection:
        """Build the duckdb connection that backs `search`, once per process.

        The `cells` table is created from `sources`, which the SQL refers to by
        name; this file assigns a pandas DataFrame to `sources` at module level
        before the first call. A full text search index is then created on the
        `source` column, using `path` as the document identifier.
        """
        con = duckdb.connect()
        # CREATE TABLE ... AS SELECT already copies every row of `sources`;
        # inserting the same SELECT again would store each cell twice and make
        # every search hit show up as a duplicate.
        con.execute("CREATE TABLE cells AS SELECT * FROM sources")
        con.execute("""PRAGMA create_fts_index('cells', 'path', 'source');""")
        return con

create a shape of the cells that duckdb can use. we ignore metadata, attachments and outputs.

    def get_fts_sources(cells):
        """Reshape the cells frame into the flat table that duckdb indexes.

        `cells` is expected to carry a (file, cell number) MultiIndex. The
        result has a fresh integer index and a leading `path` column of the
        form ``<file>#/cells/<n>``. `source` is joined into one string and a
        missing `execution_count` becomes -1.
        """
        # a corpus does not always have all three columns (markdown cells have
        # no outputs, attachments are rare), so absent ones are skipped instead
        # of raising KeyError.
        sources = cells.drop(columns=["metadata", "attachments", "outputs"], errors="ignore")
        # a source stored as a list of lines is concatenated; joining a plain
        # string on "" leaves it unchanged.
        sources["source"] = sources["source"].str.join("")
        paths = pandas.Index(
            ["#/cells/".join(str(part) for part in key) for key in sources.index],
            name="path",
        )
        sources = sources.set_index(paths).reset_index()
        if "execution_count" in sources.columns:
            sources["execution_count"] = sources["execution_count"].fillna(-1)
        else:
            # no cell carried an execution count: keep the column anyway so the
            # resulting table always has the same columns.
            sources["execution_count"] = -1.0
        return sources

load all the documents in as cells

    def get_cells(docs):
        """Explode the `cells` column of `docs` into one row per cell.

        The returned frame keeps the index of `docs` and appends a `cell_ct`
        level holding each cell's position in its document; the keys of each
        cell become the columns.
        """
        # pair every cell with its position inside the document
        numbered = docs["cells"].apply(lambda cells: list(enumerate(cells)))
        # one row per (position, cell) pair, split into two columns
        pairs = numbered.explode().apply(pandas.Series)
        pairs = pairs.rename(columns={0: "cell_ct", 1: "cell"})
        by_position = pairs.set_index("cell_ct", append=True)
        # expand each cell mapping into columns
        return by_position["cell"].apply(pandas.Series)

get_files creates our first dataframes

    def get_files(dir) -> pandas.DataFrame:
        """List the files found by `iter_files(dir)` as a DataFrame.

        The frame is indexed by the yielded paths (index name `file`) and has a
        single `suffix` column holding each path's `.suffix`.
        """
        # imported locally so the function does not rely on a star import
        # happening to supply a name called `operator`
        import operator
        files = pandas.DataFrame(index=pandas.Index(iter_files(dir), name="file"))
        return files.assign(suffix=files.index.map(operator.attrgetter("suffix")))

get_markdown_file reads a markdown file as a markdown notebook cell.

    def get_markdown_file(md):
        """Wrap the markdown text `md` in a new v4 notebook with one markdown cell."""
        from nbformat import v4
        cell = v4.new_markdown_cell(md)
        return v4.new_notebook(cells=[cell])

    def get_docs(files: pandas.DataFrame) -> pandas.DataFrame:
        """Read every file in `files` and return one notebook-shaped row per file.

        `files` is indexed by path and has a `suffix` column. `.ipynb` files are
        parsed as JSON, `.md` files are wrapped by `get_markdown_file`; in both
        cases the top-level notebook keys become the columns.
        """
        # decode explicitly as UTF-8 (the encoding notebooks are written in)
        # instead of whatever the platform default happens to be
        files = files.assign(text=files.index.map(lambda path: path.read_text(encoding="utf-8")))
        return pandas.concat([
            files[files.suffix.eq(".ipynb")].text.apply(compose_left(orjson.loads, pandas.Series)),
            files[files.suffix.eq(".md")].text.apply(compose_left(get_markdown_file, pandas.Series)),
        ])

    def get_cells_frame(dir):
        """Chain the loaders: files under `dir` -> documents -> one row per cell."""
        files = get_files(dir)
        docs = get_docs(files)
        return get_cells(docs)

iter_files finds files matching an include pattern, and not matching an exclude pattern

    def iter_files(dir=None, exclude=".nox\n.ipynb_checkpoints\n", include="*.md\n*.ipynb"):
        """Recursively yield files under `dir` matching `include` but not `exclude`.

        `dir` defaults to the current working directory. `exclude` and `include`
        are newline separated patterns compiled with pathspec. Directories that
        match `exclude` are not descended into.
        """
        import pathspec
        exclude_spec = pathspec.PathSpec.from_lines(pathspec.GitIgnorePattern, exclude.splitlines())
        include_spec = pathspec.PathSpec.from_lines(pathspec.GitIgnorePattern, include.splitlines())
        dir = pathlib.Path(dir or pathlib.Path.cwd())
        for f in dir.iterdir():
            if f.is_dir():
                if not exclude_spec.match_file(f):
                    # forward the caller's patterns; without them the recursion
                    # would fall back to the defaults below the top directory
                    yield from iter_files(f, exclude, include)
            elif f.is_file():
                if include_spec.match_file(f) and not exclude_spec.match_file(f):
                    yield f

iter_files uses a pattern i like where pathspec defines the files included and excluded. sometimes include/exclude logic can be confusing. the .gitignore convention is adopted so we can rely on it and point to someone else's docs.

using our search function

    import pathspec, dataclasses, orjson, pathlib; from toolz.curried import *

initialize the pandas.DataFrame so duckdb can use it. our table in this work is cells

initialize the `duckdb` tables from pandas

https://duckdb.org/docs/guides/python/import_pandas.html

    # `I` is true when no `__file__` name is defined here (presumably an
    # interactive notebook session rather than an import); it gates the demos.
    I = "__file__" not in locals()
    if I:
        # `sources` is the frame that get_db's SQL refers to by name
        sources = get_fts_sources(get_cells_frame(".."))
        display(get_db().execute("DESCRIBE cells").df())

	column_name	column_type	null	key	default	extra
0	path	VARCHAR	YES	NaN	NaN	NaN
1	cell_type	VARCHAR	YES	NaN	NaN	NaN
2	id	VARCHAR	YES	NaN	NaN	NaN
3	source	VARCHAR	YES	NaN	NaN	NaN
4	execution_count	DOUBLE	YES	NaN	NaN	NaN

sample searches

    # show the best matches for "pandas", only when `I` is set
    if I:
        display(search("pandas").head())

	path	cell_type	id	source	execution_count	score
0	../xxii/oct/2022-10-29-metadata-formatter.ipyn...	code	e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0	if ACTIVE:\n import pandas\n ...	8.0	1.718594
1	../xxii/oct/2022-10-29-metadata-formatter.ipyn...	code	e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0	if ACTIVE:\n import pandas\n ...	8.0	1.718594
2	../xxiii/2023-01-02-accessible-dataframes-basi...	code	401913ff-534f-4659-aec5-0784b1f1f34c	(df := pandas.DataFrame(\n columns=...	2.0	1.679535
3	../xxiii/2023-01-11-accessible-dataframes-comp...	code	401913ff-534f-4659-aec5-0784b1f1f34c	(df := pandas.DataFrame(\n columns=...	2.0	1.679535
4	../xxiii/2023-01-02-accessible-dataframes-basi...	code	401913ff-534f-4659-aec5-0784b1f1f34c	(df := pandas.DataFrame(\n columns=...	2.0	1.679535

    # show the four best matches for "toolz", only when `I` is set
    if I:
        display(search("toolz").head(4))

	path	cell_type	id	source	execution_count	score
0	../xxii/oct/colormap-dataframes/2021-10-11-col...	code	391cab50-209e-4843-8bea-0405f6734e6f	import pandas, numpy, toolz.curried as toolz	1.0	3.776350
1	../xxii/oct/colormap-dataframes/2021-10-11-col...	code	391cab50-209e-4843-8bea-0405f6734e6f	import pandas, numpy, toolz.curried as toolz	1.0	3.776350
2	../xxiii/2023-01-11-duckdb-search.ipynb#/cells/20	code	d3a6ca2a-7b1d-4f0a-a86c-9345913468c0	import pathspec, dataclasses, orjson, path...	-1.0	3.340617
3	../xxiii/2023-01-11-duckdb-search.ipynb#/cells/26	code	589d3fb6-a4bb-434f-a06e-05d57fe57f09	I and display(search("toolz").head(4))	-1.0	3.340617