1

full text search for notebooks and files using duckdb

duckdb is great for moderately sized data. maybe it would be good for searching notebooks. i know pandas, so we are going to use pandas to load in our data.

  1. read the files
  2. load their contents into the nbformat shape
  3. create the table in an in-memory duckdb
  4. add full text search over the columns
  5. search the source (a rough sketch of the whole flow follows this list)
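
the pieces defined in the rest of the document compose to roughly the sketch below; get_cells_frame, get_fts_sources, and search are the functions defined in later cells, and cell 24 runs the real thing against the parent directory.

    # a sketch of how the steps fit together (cell 24 is the real run)
    sources = get_fts_sources(get_cells_frame(".."))  # steps 1-3: files -> cells -> a frame duckdb can read
    search("duckdb").head()                           # steps 4-5: get_db() builds the fts index on first use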
2
    import pandas, duckdb, functools
3

search

the use of the search function is demonstrated at the end of the document

4
    def search(q) -> pandas.DataFrame:
        return (get_db().execute(F"""
        SELECT * FROM
        (
            SELECT *, fts_main_cells.match_bm25(path, '{q}', fields:='source') AS score FROM cells
        )
        WHERE score IS NOT NULL
        ORDER BY score DESC;
        """)).df()
5

https://duckdb.org/docs/extensions/full_text_search
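
the whole extension is two calls: a pragma that builds the index and a match_bm25 macro that scores rows against a query. a tiny self-contained run on a made-up docs table looks like this, assuming the fts extension ships with the python package like it does for this notebook:

    import duckdb
    con = duckdb.connect()
    con.execute("CREATE TABLE docs (path VARCHAR, body VARCHAR)")
    con.execute("""INSERT INTO docs VALUES
        ('a.md', 'duckdb has full text search'), ('b.md', 'pandas makes dataframes')""")
    con.execute("PRAGMA create_fts_index('docs', 'path', 'body')")
    con.execute("""
        SELECT path, fts_main_docs.match_bm25(path, 'duckdb', fields := 'body') AS score
        FROM docs ORDER BY score DESC
    """).df()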

6
    @functools.lru_cache # this makes our function a singleton
    def get_db() -> duckdb.DuckDBPyConnection:
        con = duckdb.connect()
        # sources is a pandas dataframe in the python scope; duckdb's replacement scan
        # lets the SQL refer to it by name. CREATE TABLE ... AS already copies its rows,
        # so the INSERT below stores every cell a second time (which is why the sample
        # searches later show duplicate rows).
        con.execute("CREATE TABLE cells AS SELECT * FROM sources")
        con.execute("INSERT INTO cells SELECT * FROM sources")
        con.execute("""PRAGMA create_fts_index('cells', 'path', 'source');""")
        return con
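
get_db selects from sources, which only exists as a pandas DataFrame in the python scope; duckdb's replacement scan resolves that name from the surrounding frame when the query runs. a minimal demonstration of the behaviour on a throwaway frame:

    import duckdb, pandas
    df = pandas.DataFrame({"a": [1, 2, 3]})
    # the table name df is looked up in the python scope at query time
    duckdb.connect().execute("SELECT sum(a) AS total FROM df").df()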
7

create a shape of the cells that duckdb can use. we ignore metadata, attachments and outputs.

8
    def get_fts_sources(cells):
        sources = cells.drop(columns=["metadata", "attachments", "outputs"])
        sources.source = sources.source.str.join("")  # cell sources are lists of lines; join them into one string
        # flatten the (file, cell_ct) index into a "file#/cells/N" path column
        sources = sources.set_index(sources.index.map(compose_left(map(str), "#/cells/".join)).rename("path")).reset_index()
        sources.execution_count = sources.execution_count.fillna(-1)
        return sources
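
the fiddly part is the index: the (file, cell_ct) MultiIndex is flattened into a path like file#/cells/N, a json-pointer-ish address for the cell. with a made-up two-cell index:

    import pandas
    from toolz.curried import compose_left, map

    index = pandas.MultiIndex.from_tuples(
        [("a.ipynb", 0), ("a.ipynb", 1)], names=["file", "cell_ct"])
    list(index.map(compose_left(map(str), "#/cells/".join)))
    # ['a.ipynb#/cells/0', 'a.ipynb#/cells/1']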
9

load all the documents in as cells

10
    def get_cells(docs):    
        return (
            docs["cells"].apply(
                compose_left(enumerate, list)
            ).explode().apply(pandas.Series)
            .rename(columns={0: "cell_ct", 1: "cell"})
            .set_index("cell_ct", append=True)["cell"]
            .apply(pandas.Series)
        )
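
with a toy docs frame the reshaping is easier to see: one row per notebook goes in, one row per cell comes out, indexed by (file, cell_ct).

    import pandas
    docs = pandas.DataFrame(
        {"cells": [[
            {"cell_type": "markdown", "source": ["# a title"]},
            {"cell_type": "code", "source": ["1 + 1"]},
        ]]},
        index=pandas.Index(["a.ipynb"], name="file"),
    )
    get_cells(docs)  # two rows with cell_type and source columns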
11

get_files creates our first dataframe, one row per file

12
    def get_files(dir) -> pandas.DataFrame:
        files = pandas.DataFrame(index=pandas.Index(iter_files(dir), name="file"))
        return files.assign(suffix=files.index.map(operator.attrgetter("suffix")))
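
the suffix column is just pathlib's suffix pulled off each path in the index; with a couple of literal paths for illustration:

    import operator, pandas, pathlib
    files = pandas.DataFrame(index=pandas.Index(
        [pathlib.Path("2023-01-11-duckdb-search.ipynb"), pathlib.Path("readme.md")], name="file"))
    files.assign(suffix=files.index.map(operator.attrgetter("suffix")))  # .ipynb, .md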
13

get_markdown_file wraps a markdown document as a notebook containing a single markdown cell.

14
    def get_markdown_file(md):
        import nbformat
        return nbformat.v4.new_notebook(cells=[nbformat.v4.new_markdown_cell(md)])    
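
a quick check that a markdown string really does come back as a one-cell notebook:

    nb = get_markdown_file("# a title\n\nsome prose")
    len(nb.cells), nb.cells[0]["cell_type"]  # (1, 'markdown')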
15
    def get_docs(files: pandas.DataFrame) -> pandas.DataFrame:
        files = files.assign(text=files.index.map(pathlib.Path.read_text))
        return pandas.concat([
            files[files.suffix.eq(".ipynb")].text.apply(compose_left(orjson.loads, pandas.Series)),
            files[files.suffix.eq(".md")].text.apply(compose_left(get_markdown_file, pandas.Series)),        
        ])
16
    def get_cells_frame(dir): return get_cells(get_docs(get_files(dir))) 
17

iter_files finds files matching an include pattern, and not matching an exclude pattern

18
    def iter_files(dir=None, exclude=".nox\n.ipynb_checkpoints\n", include="*.md\n*.ipynb"):
        import pathspec
        exclude_spec = pathspec.PathSpec.from_lines(pathspec.GitIgnorePattern, exclude.splitlines())
        include_spec = pathspec.PathSpec.from_lines(pathspec.GitIgnorePattern, include.splitlines())            
        dir = pathlib.Path(dir or pathlib.Path.cwd())
        for f in dir.iterdir():
            if f.is_dir():
                if not exclude_spec.match_file(f):
                    yield from iter_files(f)
            if f.is_file():
                if include_spec.match_file(f):
                    if not exclude_spec.match_file(f):
                        yield f
19

iter_files uses a pattern i like where pathspec defines the files included and excluded. include/exclude logic can be confusing, so adopting the .gitignore convention means we can lean on it and point at someone else's docs.
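
pathspec speaks the .gitignore dialect directly; newer releases spell the pattern class gitwildmatch, and GitIgnorePattern used above is the older name for the same rules. a small standalone example of the matching, with made-up paths:

    import pathspec
    spec = pathspec.PathSpec.from_lines("gitwildmatch", ["*.ipynb", "!drafts/*"])
    spec.match_file("notes/a.ipynb"), spec.match_file("drafts/scratch.ipynb")
    # (True, False)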

20

using our search function

21
    import pathspec, dataclasses, orjson, pathlib; from toolz.curried import *
22

initialize the pandas.DataFrame so duckdb can use it. our table in this work is named cells

23

initialize the tables from pandas

https://duckdb.org/docs/guides/python/import_pandas.html

24
    if (I := "__file__" not in locals()):  # only run interactively; skip when this notebook is imported as a module
        sources = get_fts_sources(get_cells_frame(".."))
        display(get_db().execute("DESCRIBE cells").df())
1 outputs.
column_name column_type null key default extra
0 path VARCHAR YES NaN NaN NaN
1 cell_type VARCHAR YES NaN NaN NaN
2 id VARCHAR YES NaN NaN NaN
3 source VARCHAR YES NaN NaN NaN
4 execution_count DOUBLE YES NaN NaN NaN
25

sample searches

26
    I and display(search("pandas").head())
1 outputs.
path cell_type id source execution_count score
0 ../xxii/oct/2022-10-29-metadata-formatter.ipyn... code e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0 if ACTIVE:\n import pandas\n ... 8.0 1.718594
1 ../xxii/oct/2022-10-29-metadata-formatter.ipyn... code e3a8b43e-aaeb-4b7a-9ccb-4485ae0689a0 if ACTIVE:\n import pandas\n ... 8.0 1.718594
2 ../xxiii/2023-01-02-accessible-dataframes-basi... code 401913ff-534f-4659-aec5-0784b1f1f34c (df := pandas.DataFrame(\n columns=... 2.0 1.679535
3 ../xxiii/2023-01-11-accessible-dataframes-comp... code 401913ff-534f-4659-aec5-0784b1f1f34c (df := pandas.DataFrame(\n columns=... 2.0 1.679535
4 ../xxiii/2023-01-02-accessible-dataframes-basi... code 401913ff-534f-4659-aec5-0784b1f1f34c (df := pandas.DataFrame(\n columns=... 2.0 1.679535
27
    I and display(search("toolz").head(4))
1 outputs.
path cell_type id source execution_count score
0 ../xxii/oct/colormap-dataframes/2021-10-11-col... code 391cab50-209e-4843-8bea-0405f6734e6f import pandas, numpy, toolz.curried as toolz 1.0 3.776350
1 ../xxii/oct/colormap-dataframes/2021-10-11-col... code 391cab50-209e-4843-8bea-0405f6734e6f import pandas, numpy, toolz.curried as toolz 1.0 3.776350
2 ../xxiii/2023-01-11-duckdb-search.ipynb#/cells/20 code d3a6ca2a-7b1d-4f0a-a86c-9345913468c0 import pathspec, dataclasses, orjson, path... -1.0 3.340617
3 ../xxiii/2023-01-11-duckdb-search.ipynb#/cells/26 code 589d3fb6-a4bb-434f-a06e-05d57fe57f09 I and display(search("toolz").head(4)) -1.0 3.340617