full text search for notebooks and files using duckdb¤
duckdb
is great for moderate sized data.
maybe it would be good for searching notebooks.
i know pandas
so we are going to use pandas
to load in our data
- reads files
- load contents in the
nbformat
- create the table on a in memory duckdb
- at full text search the columns
- search the source
import pandas, duckdb, functools
search
is our database goal¤
the use of the search
is demonstrated at the end of the document
def search(q) -> pandas.DataFrame:
return (get_db().execute(F"""
SELECT * FROM
(
SELECT *, fts_main_cells.match_bm25(path, '{q}', fields:='source') AS score FROM cells
)
WHERE score IS NOT NULL
ORDER BY score DESC;
""")).df()
https://duckdb.org/docs/extensions/full_text_search
@functools.lru_cache # this makes our function a singleton
def get_db() -> duckdb.DuckDBPyConnection:
con = duckdb.connect()
con.execute("CREATE TABLE cells AS SELECT * FROM sources")
con.execute("INSERT INTO cells SELECT * FROM sources")
con.execute("""PRAGMA create_fts_index('cells', 'path', 'source');""")
return con
create a shape of the cells that duckdb can use. we ignore metadata, attachments and outputs.
def get_fts_sources(cells):
sources = cells.drop(columns=["metadata", "attachments", "outputs"])
sources.source = sources.source.str.join("")
sources = sources.set_index(sources.index.map(compose_left(map(str), "#/cells/".join)).rename("path")).reset_index()
sources.execution_count = sources.execution_count.fillna(-1)
return sources
load all the documents in as cells¤
def get_cells(docs):
return (
docs["cells"].apply(
compose_left(enumerate, list)
).explode().apply(pandas.Series)
.rename(columns={0: "cell_ct", 1: "cell"})
.set_index("cell_ct", append=True)["cell"]
.apply(pandas.Series)
)
get_files
creates our first dataframes
def get_files(dir) -> pandas.DataFrame:
files = pandas.DataFrame(index=pandas.Index(iter_files(dir), name="file"))
return files.assign(suffix=files.index.map(operator.attrgetter("suffix")))
get_markdown_file
reads a markdown file as a markdown notebook cell.
def get_markdown_file(md):
import nbformat
return nbformat.v4.new_notebook(cells=[nbformat.v4.new_markdown_cell(md)])
def get_docs(files: pandas.DataFrame) -> pandas.DataFrame:
files = files.assign(text=files.index.map(pathlib.Path.read_text))
return pandas.concat([
files[files.suffix.eq(".ipynb")].text.apply(compose_left(orjson.loads, pandas.Series)),
files[files.suffix.eq(".md")].text.apply(compose_left(get_markdown_file, pandas.Series)),
])
def get_cells_frame(dir): return get_cells(get_docs(get_files(dir)))
iter_files
finds files matching an include pattern, and not matching an exclude pattern
def iter_files(dir=None, exclude=".nox\n.ipynb_checkpoints\n", include="*.md\n*.ipynb"):
import pathspec
exclude_spec = pathspec.PathSpec.from_lines(pathspec.GitIgnorePattern, exclude.splitlines())
include_spec = pathspec.PathSpec.from_lines(pathspec.GitIgnorePattern, include.splitlines())
dir = pathlib.Path(dir or pathlib.Path.cwd())
for f in dir.iterdir():
if f.is_dir():
if not exclude_spec.match_file(f):
yield from iter_files(f)
if f.is_file():
if include_spec.match_file(f):
if not exclude_spec.match_file(f):
yield f
iter_files
uses a pattern i like where pathspec
defines the files included and excluded.
sometimes include/exclude logic can be confusing. the .gitignore
convention is adopted to rely on that and point someone else's docs.
using our search function¤
import pathspec, dataclasses, orjson, pathlib; from toolz.curried import *
initialize the pandas.DataFrame
so duckdb
can use it. our table in this work is cells
initialize the duckdb
tables from pandas¤
https://duckdb.org/docs/guides/python/import_pandas.html
if (I := "__file__" not in locals()):
sources = get_fts_sources(get_cells_frame(".."))
display(get_db().execute("DESCRIBE cells").df())
sample searches¤
I and display(search("pandas").head())
I and display(search("toolz").head(4))