Skip to content

full text search for notebooks and files using duckdb¤

duckdb is great for moderate sized data. maybe it would be good for searching notebooks. i know pandas so we are going to use pandas to load in our data

  1. reads files
  2. load contents in the nbformat
  3. create the table on a in memory duckdb
  4. at full text search the columns
  5. search the source
    import pandas, duckdb, functools

search is our database goal¤

the use of the search is demonstrated at the end of the document

    def search(q) -> pandas.DataFrame:
        return (get_db().execute(F"""
        SELECT * FROM
        (
            SELECT *, fts_main_cells.match_bm25(path, '{q}', fields:='source') AS score FROM cells
        )
        WHERE score IS NOT NULL
        ORDER BY score DESC;
        """)).df()

https://duckdb.org/docs/extensions/full_text_search

    @functools.lru_cache # this makes our function a singleton
    def get_db() -> duckdb.DuckDBPyConnection:
        con = duckdb.connect()
        con.execute("CREATE TABLE cells AS SELECT * FROM sources")
        con.execute("INSERT INTO cells SELECT * FROM sources")
        con.execute("""PRAGMA create_fts_index('cells', 'path', 'source');""")
        return con    

create a shape of the cells that duckdb can use. we ignore metadata, attachments and outputs.

    def get_fts_sources(cells):
        sources = cells.drop(columns=["metadata", "attachments", "outputs"])
        sources.source = sources.source.str.join("")
        sources = sources.set_index(sources.index.map(compose_left(map(str), "#/cells/".join)).rename("path")).reset_index()
        sources.execution_count = sources.execution_count.fillna(-1)
        return sources

load all the documents in as cells¤

    def get_cells(docs):    
        return (
            docs["cells"].apply(
                compose_left(enumerate, list)
            ).explode().apply(pandas.Series)
            .rename(columns={0: "cell_ct", 1: "cell"})
            .set_index("cell_ct", append=True)["cell"]
            .apply(pandas.Series)
        )

get_files creates our first dataframes

    def get_files(dir) -> pandas.DataFrame:
        files = pandas.DataFrame(index=pandas.Index(iter_files(dir), name="file"))
        return files.assign(suffix=files.index.map(operator.attrgetter("suffix")))

get_markdown_file reads a markdown file as a markdown notebook cell.

    def get_markdown_file(md):
        import nbformat
        return nbformat.v4.new_notebook(cells=[nbformat.v4.new_markdown_cell(md)])    
    def get_docs(files: pandas.DataFrame) -> pandas.DataFrame:
        files = files.assign(text=files.index.map(pathlib.Path.read_text))
        return pandas.concat([
            files[files.suffix.eq(".ipynb")].text.apply(compose_left(orjson.loads, pandas.Series)),
            files[files.suffix.eq(".md")].text.apply(compose_left(get_markdown_file, pandas.Series)),        
        ])
    def get_cells_frame(dir): return get_cells(get_docs(get_files(dir))) 

iter_files finds files matching an include pattern, and not matching an exclude pattern

    def iter_files(dir=None, exclude=".nox\n.ipynb_checkpoints\n", include="*.md\n*.ipynb"):
        import pathspec
        exclude_spec = pathspec.PathSpec.from_lines(pathspec.GitIgnorePattern, exclude.splitlines())
        include_spec = pathspec.PathSpec.from_lines(pathspec.GitIgnorePattern, include.splitlines())            
        dir = pathlib.Path(dir or pathlib.Path.cwd())
        for f in dir.iterdir():
            if f.is_dir():
                if not exclude_spec.match_file(f):
                    yield from iter_files(f)
            if f.is_file():
                if include_spec.match_file(f):
                    if not exclude_spec.match_file(f):
                        yield f

iter_files uses a pattern i like where pathspec defines the files included and excluded. sometimes include/exclude logic can be confusing. the .gitignore convention is adopted to rely on that and point someone else's docs.

using our search function¤

    import pathspec, dataclasses, orjson, pathlib; from toolz.curried import *

initialize the pandas.DataFrame so duckdb can use it. our table in this work is cells

initialize the duckdb tables from pandas¤

https://duckdb.org/docs/guides/python/import_pandas.html

    if (I := "__file__" not in locals()):
        sources = get_fts_sources(get_cells_frame(".."))
        display(get_db().execute("DESCRIBE cells").df())
column_name column_type null key default extra
0 path VARCHAR YES NaN NaN NaN
1 cell_type VARCHAR YES NaN NaN NaN
2 id VARCHAR YES NaN NaN NaN
3 source VARCHAR YES NaN NaN NaN
4 execution_count DOUBLE YES NaN NaN NaN

sample searches¤

    I and display(search("pandas").head())
path cell_type id source execution_count score
0 ../xxii/oct/2022-10-29-metadata-formatter.ipyn... code e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0 if ACTIVE:\n import pandas\n ... 8.0 1.718594
1 ../xxii/oct/2022-10-29-metadata-formatter.ipyn... code e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0 if ACTIVE:\n import pandas\n ... 8.0 1.718594
2 ../xxiii/2023-01-02-accessible-dataframes-basi... code 401913ff-534f-4659-aec5-0784b1f1f34c (df := pandas.DataFrame(\n columns=... 2.0 1.679535
3 ../xxiii/2023-01-11-accessible-dataframes-comp... code 401913ff-534f-4659-aec5-0784b1f1f34c (df := pandas.DataFrame(\n columns=... 2.0 1.679535
4 ../xxiii/2023-01-02-accessible-dataframes-basi... code 401913ff-534f-4659-aec5-0784b1f1f34c (df := pandas.DataFrame(\n columns=... 2.0 1.679535
    I and display(search("toolz").head(4))
path cell_type id source execution_count score
0 ../xxii/oct/colormap-dataframes/2021-10-11-col... code 391cab50-209e-4843-8bea-0405f6734e6f import pandas, numpy, toolz.curried as toolz 1.0 3.776350
1 ../xxii/oct/colormap-dataframes/2021-10-11-col... code 391cab50-209e-4843-8bea-0405f6734e6f import pandas, numpy, toolz.curried as toolz 1.0 3.776350
2 ../xxiii/2023-01-11-duckdb-search.ipynb#/cells/20 code d3a6ca2a-7b1d-4f0a-a86c-9345913468c0 import pathspec, dataclasses, orjson, path... -1.0 3.340617
3 ../xxiii/2023-01-11-duckdb-search.ipynb#/cells/26 code 589d3fb6-a4bb-434f-a06e-05d57fe57f09 I and display(search("toolz").head(4)) -1.0 3.340617