index
|
execution_count
|
cell_type
|
toolbar
|
started_at
|
completed_at
|
source
|
loc
|
metadata
|
outputs
|
1
|
|
|
|
|
|
# use `whoosh` to search cells/articles on disk
|
|
1
|
|
2
|
|
|
|
|
|
https://whoosh.readthedocs.io/en/latest/
!pip install whoosh
|
|
3
|
https://whoosh.readthedocs.io/en/latest/
!pip install whoosh
|
3
|
|
|
|
|
|
import whoosh.fields, whoosh.index, whoosh.qparser, whoosh.writing
import pathlib, shutil
|
|
2
|
0 outputs.
|
4
|
|
|
|
|
|
from tonyfast import nbframe
import nest_asyncio

# Patch asyncio before loading so the load can run inside the notebook's
# already-running event loop (presumably why nest_asyncio is applied here).
nest_asyncio.apply()

# Look for documents starting from the parent directory and load them.
finder = nbframe.Finder(dir="..")
self = nbframe.Documents(finder).load()
|
|
3
|
0 outputs.
|
5
|
|
|
|
|
|
## initialize the search index
|
|
1
|
|
6
|
|
|
|
|
|
# Directory that holds the on-disk whoosh index.
INDEX = pathlib.Path("search_index")
INDEX.mkdir(exist_ok=True)

# Each indexed document has searchable `source` text and a `path` identifier
# that is stored so it can be returned with search hits.
schema = whoosh.fields.Schema(
    source=whoosh.fields.TEXT,
    path=whoosh.fields.ID(stored=True),
)

# Create the index in the directory, then open it for writing and searching.
whoosh.index.create_in(INDEX, schema)
index = whoosh.index.open_dir(INDEX)
|
|
5
|
0 outputs.
|
7
|
|
|
|
|
|
from tonyfast import nbframe

# Load the documents found from the parent directory and rebind `self`.
finder = nbframe.Finder(dir="..")
self = nbframe.Documents(finder).load()
|
|
2
|
0 outputs.
|
8
|
|
|
|
|
|
`self.articles` is a dataframe containing notebooks and files cast to the notebook schema. The dask dataframe and its head are shown below.
|
|
1
|
self.articles
is a dataframe containing notebooks and files cast to the notebook schema. The dask dataframe and its head are shown below.
|
9
|
|
|
|
|
|
# Show the articles frame itself, followed by a preview from `head`
# (presumably 10 rows drawn from the first 5 dask partitions — confirm).
preview = self.articles.head(10, 5)
display(self.articles, preview)
|
|
1
|
2 outputs.
|
10
|
|
|
|
|
|
def get_article_path(s):
    """Return a ``<name>#/cells/<cell_ct>`` identifier for one row.

    ``s`` is any object exposing ``name`` and ``cell_ct`` attributes; both
    are converted with ``str`` before being joined.
    """
    return f"{s.name!s}#/cells/{s.cell_ct!s}"
# Build the per-row path identifier (object dtype) and attach it as a column.
article_paths = self.articles.apply(get_article_path, axis=1, meta=("path", "O"))
self.articles["path"] = article_paths
|
|
2
|
0 outputs.
|
11
|
|
|
|
|
|
def write_documents(df):
    """Add every row of ``df`` to the search index as one document.

    Each row is unpacked into keyword arguments for ``add_document``, so the
    column names must match the schema's field names. The writer is used as
    a context manager, which closes it once the rows have been added.
    """
    writer = whoosh.writing.AsyncWriter(index)
    with writer as batch:
        for _label, row in df.iterrows():
            batch.add_document(**row)
# Keep only the two schema fields and collapse each value with "".join
# (presumably cell sources arrive as lists of lines — confirm).
fields = self.articles[["source", "path"]].applymap("".join)
# One group per index label, each written to the search index on compute.
grouped = fields.groupby(self.articles.index)
grouped.apply(write_documents, meta=("none", int)).compute()
|
|
5
|
1 outputs.
|
12
|
|
|
|
|
|
## querying the documents
|
|
1
|
|
13
|
|
|
|
|
|
# Parser that turns query strings into queries against the `source` field.
query = whoosh.qparser.QueryParser(fieldname="source", schema=schema)
|
|
1
|
0 outputs.
|
14
|
|
|
|
|
|
# Parse the query text up front, then run it while the searcher is open.
parsed = query.parse("literate computing")
with index.searcher() as search:
    hits = search.search(parsed)
    print(hits)
|
|
2
|
1 outputs.
|