use [`whoosh`](https://whoosh.readthedocs.io/en/latest/) to search cells/articles on disk
!pip install whoosh
import whoosh.fields, whoosh.index, whoosh.qparser, whoosh.writing
import pathlib, shutil
from tonyfast import nbframe
__import__("nest_asyncio").apply()  # allow nested event loops inside the running notebook
self = nbframe.Documents(nbframe.Finder(dir="..")).load()  # discover and load notebooks from the parent directory
initialize the search index
INDEX = pathlib.Path("search_index")
INDEX.mkdir(exist_ok=True)
# the schema indexes each cell's source for full-text search and stores its path for retrieval
whoosh.index.create_in(INDEX, schema := whoosh.fields.Schema(source=whoosh.fields.TEXT, path=whoosh.fields.ID(stored=True)))
index = whoosh.index.open_dir(INDEX)
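note that `whoosh.index.create_in` clobbers any existing index in the directory. a minimal sketch of a guard, assuming we only want to rebuild when no index exists yet:

    # reuse an existing index when present; otherwise create a fresh one (a sketch, not the cell above)
    if whoosh.index.exists_in(INDEX):
        index = whoosh.index.open_dir(INDEX)
    else:
        index = whoosh.index.create_in(INDEX, schema)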
`self.articles` is a dataframe containing notebooks and files cast to the notebook schema. the dask and pandas dataframes are shown below.
display(self.articles, self.articles.head(10, 5))  # the lazy dask structure, then the first 10 rows drawn from 5 partitions
Dask DataFrame Structure:

| | cell_type | execution_count | id | metadata | outputs | source | cell_ct | attachments |
|---|---|---|---|---|---|---|---|---|
| npartitions=85 | | | | | | | | |
| ../2023-01-19-.ipynb | object | int64 | object | object | object | object | int64 | object |
| ../2023-01-19-pidgy-afforndances.ipynb | ... | ... | ... | ... | ... | ... | ... | ... |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| ../xxiii/vendor/tree-sitter-python/test/highlight/pattern_matching.py | ... | ... | ... | ... | ... | ... | ... | ... |
| ../xxiii/what.md | ... | ... | ... | ... | ... | ... | ... | ... |

Dask Name: apply, 1 graph layer
| path | cell_type | execution_count | id | metadata | outputs | source | cell_ct | attachments |
|---|---|---|---|---|---|---|---|---|
| ../2023-01-19-.ipynb | code | None | ad5f3630-daac-4b8b-95b0-22f27ea47af2 | {} | [] | | 0 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | code | 1.0 | 409a2348-866f-4127-a25b-7fc0adcac5fc | {} | [{'ename': 'SyntaxError', 'evalue': 'invalid s... | when i program in `pidgy`\n\n* `sys.modules` a... | 0 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | code | 1.0 | 45c4aa5d-e53d-4a02-82b2-8f872906ceba | {} | [{'data': {'text/markdown': ' %reload_ext p... | %reload_ext pidgy\n from toolz.curried ... | 1 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | markdown | NaN | 87b3dd7c-3b9a-406d-b39c-547c69a938f7 | {} | None | <iframe src="http://127.0.0.1:8787/status"... | 2 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | markdown | NaN | d3d234cd-7d46-44b2-b26f-a80e524764ec | {} | None | # start the contents finder | 3 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | code | 1.0 | 3563e7c1-6f4f-4774-84c9-f2db766238cb | {} | [{'data': {'text/html': '<div>\n<div style=... | \n %reload_ext pidgy\n import nbfram... | 4 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | code | 11.0 | 4b4f8e4c-d192-4dd8-818e-b728d4b6673e | {} | [{'data': {'text/html': '<div>\n<style scoped>\n... | result | 5 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | code | 21.0 | 1fba636a-14c1-4011-8af1-5d54452f7e36 | {} | [{'data': {'text/markdown': ' pretty neat that... | {{asyncio.sleep(1) or ""}} pretty neat that we... | 6 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | code | 18.0 | efdfc300-da33-40d5-b1ea-8636e8e1a001 | {} | [{'data': {'text/markdown': ' docs= 2', 'te... | docs= 2 | 7 | None |
| ../2023-01-19-pidgy-afforndances.ipynb | markdown | NaN | bad77dd2-294b-4f6a-a520-4ff25be27d41 | {} | None | load and persist the data | 8 | None |
def get_article_path(s): return str(s.name) + "#/cells/" + str(s.cell_ct)  # address a cell as <notebook path>#/cells/<cell number>
self.articles["path"] = self.articles.apply(get_article_path, meta=("path", "O"), axis=1)
def write_documents(df):
    # add one whoosh document per cell; AsyncWriter queues writes without blocking on the index lock
    with whoosh.writing.AsyncWriter(index) as w:
        for _, x in df.iterrows(): w.add_document(**x)
self.articles[["source", "path"]].applymap("".join).groupby(self.articles.index).apply(write_documents, meta=("none", int)).compute()
Series([], Name: none, dtype: int64)
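the empty series just reflects that each partition's `write_documents` returned `None`; the side effect is the populated index. a quick sanity check, assuming the async writers have committed, is whoosh's document count:

    index.refresh().doc_count()  # refresh picks up the latest index generation before counting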
querying the documents
query = whoosh.qparser.QueryParser("source", schema)  # parse human queries against the source field
with index.searcher() as search:
    print(search.search(query.parse("literate computing")))
<Top 0 Results for And([Term('source', 'literate'), Term('source', 'computing')]) runtime=9.207999937643763e-05>