
use whoosh to search cells/articles on disk

https://whoosh.readthedocs.io/en/latest/

    !pip install whoosh
    import whoosh.fields, whoosh.index, whoosh.qparser, whoosh.writing
    import pathlib, shutil
    from tonyfast import nbframe
    # nest_asyncio lets nbframe run its event loop inside the notebook's already-running loop
    __import__("nest_asyncio").apply()
    # find and load every notebook and file under the parent directory
    self = nbframe.Documents(nbframe.Finder(dir="..")).load()

initialize the search index

    INDEX = pathlib.Path("search_index")
    INDEX.mkdir(exist_ok=True)

    # source is full-text indexed; path is an ID stored so search results can return it
    whoosh.index.create_in(INDEX, schema := whoosh.fields.Schema(source=whoosh.fields.TEXT, path=whoosh.fields.ID(stored=True)))
    index = whoosh.index.open_dir(INDEX)
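
re-running `create_in` clobbers whatever index is already in that directory. a small guard, sketched here with the documented `whoosh.index.exists_in` call, would reuse an index that is already on disk on later runs:

    # only build a fresh index when the directory doesn't hold one yet
    if not whoosh.index.exists_in(INDEX):
        whoosh.index.create_in(INDEX, schema)
    index = whoosh.index.open_dir(INDEX)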

`self.articles` is a dask dataframe containing notebooks and files cast to the notebook cell schema. the lazy dask structure and the first ten rows of the dataframe are shown below.

    display(self.articles, self.articles.head(10, 5))
Dask DataFrame Structure (npartitions=85, Dask Name: apply, 1 graph layer)
index: path, from ../2023-01-19-.ipynb through ../xxiii/what.md
columns: cell_type (object), execution_count (int64), id (object), metadata (object), outputs (object), source (object), cell_ct (int64), attachments (object)

the head holds one row per cell: the empty ../2023-01-19-.ipynb followed by the code and markdown cells of ../2023-01-19-pidgy-afforndances.ipynb, each with its id, metadata, outputs, source, and cell_ct.

    # address each cell by its notebook path plus a cell anchor
    def get_article_path(s): return str(s.name) + "#/cells/" + str(s.cell_ct)
    self.articles["path"] = self.articles.apply(get_article_path, meta=("path", "O"), axis=1)
    # write one whoosh document per cell: the joined source text and its path
    def write_documents(df):
        with whoosh.writing.AsyncWriter(index) as w:
            for _, x in df.iterrows(): w.add_document(**x)
    self.articles[["source", "path"]].applymap("".join).groupby(self.articles.index).apply(write_documents, meta=("none", int)).compute()
Series([], Name: none, dtype: int64)
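
a quick sanity check, assuming the writers above committed cleanly, is to refresh the index and ask how many cell documents it holds:

    # refresh picks up segments committed since the index object was opened
    index = index.refresh()
    print(index.doc_count())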

querying the documents

    query = whoosh.qparser.QueryParser("source", schema)
    with index.searcher() as search:
        print(search.search(query.parse("literate computing")))
<Top 0 Results for And([Term('source', 'literate'), Term('source', 'computing')]) runtime=9.207999937643763e-05>
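
the default parser ANDs the terms together, so a single cell has to contain both words to match. a looser sketch, assuming the index above is populated, ORs the terms and prints the stored `path` of every hit:

    # OrGroup matches cells containing either term instead of requiring both
    or_parser = whoosh.qparser.QueryParser("source", schema, group=whoosh.qparser.OrGroup)
    with index.searcher() as search:
        for hit in search.search(or_parser.parse("literate computing"), limit=10):
            print(hit["path"])  # path is the stored field, eg notebook.ipynb#/cells/N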