Merge pull request #255 from urschrei/shugel/push-ywosmxprlrrt · urschrei.eurosky.social/pyzotero@29fec29

urschrei.eurosky.social / pyzotero

fork atom

Pyzotero: a Python client for the Zotero API pyzotero.readthedocs.io

zotero

fork atom

Merge pull request #255 from urschrei/shugel/push-ywosmxprlrrt

Fix fulltext search to include parent items of matching attachments

Authored by urschrei.eurosky.social and committed by GitHub 5 months ago · 29fec29a 4776d1c7

+60 -5

5 changed files

expand all

README.md

doc

index.rst

pyproject.toml

src

pyzotero

cli.py

uv.lock

README.md

··· 59 59 pyzotero itemtypes 60 60 ``` 61 61 62 + ## Search Behaviour 63 + 64 + By default, `pyzotero search` searches only top-level item titles and metadata fields. 65 + 66 + When the `--fulltext` flag is used, the search expands to include all full-text indexed content, including PDFs and other attachments. Since most full-text content comes from PDF attachments rather than top-level items, the CLI automatically retrieves the parent bibliographic items for any matching attachments. This ensures you receive useful bibliographic records (journal articles, books, etc.) rather than raw attachment items. 67 + 62 68 ## Output Format 63 69 64 70 By default, the CLI outputs human-readable text with a subset of metadata including:

doc/index.rst

··· 152 152 153 153 pyzotero itemtypes 154 154 155 + Search Behaviour 156 + ---------------- 157 + 158 + By default, ``pyzotero search`` searches only top-level item titles and metadata fields. 159 + 160 + When the ``--fulltext`` flag is used, the search expands to include all full-text indexed content, including PDFs and other attachments. Since most full-text content comes from PDF attachments rather than top-level items, the CLI automatically retrieves the parent bibliographic items for any matching attachments. This ensures you receive useful bibliographic records (journal articles, books, etc.) rather than raw attachment items. 161 + 155 162 Output Format 156 163 ------------- 157 164

+1 -1

pyproject.toml

··· 1 1 [project] 2 2 name = "pyzotero" 3 - version = "1.7.0" 3 + version = "1.7.1" 4 4 description = "Python wrapper for the Zotero API" 5 5 readme = "README.md" 6 6 requires-python = ">=3.9"

+45 -3

src/pyzotero/cli.py

··· 6 6 import click 7 7 8 8 from pyzotero import zotero 9 + from pyzotero.zotero import chunks 9 10 10 11 11 12 def _get_zotero_client(locale="en-US"): ··· 36 37 @click.option( 37 38 "--fulltext", 38 39 is_flag=True, 39 - help="Enable full-text search (qmode='everything')", 40 + help="Search full-text content including PDFs. Retrieves parent items when attachments match.", 40 41 ) 41 42 @click.option( 42 43 "--itemtype", ··· 63 64 def search(ctx, query, fulltext, itemtype, collection, limit, output_json): # noqa: PLR0912, PLR0915 64 65 """Search local Zotero library. 65 66 67 + By default, searches top-level items in titles and metadata. 68 + 69 + When --fulltext is enabled, searches all items including attachment content 70 + (PDFs, documents, etc.). If a match is found in an attachment, the parent 71 + bibliographic item is retrieved and included in results. 72 + 66 73 Examples: 67 74 pyzotero search -q "machine learning" 68 75 ··· 92 99 # Join multiple item types with || for OR search 93 100 params["itemType"] = " || ".join(itemtype) 94 101 95 - # Execute search using collection_items_top() if collection specified, otherwise top() 96 - if collection: 102 + # Execute search 103 + # When fulltext is enabled, use items() or collection_items() to get both 104 + # top-level items and attachments. Otherwise use top() or collection_items_top() 105 + # to only get top-level items. 
106 + if fulltext: 107 + if collection: 108 + results = zot.collection_items(collection, **params) 109 + else: 110 + results = zot.items(**params) 111 + 112 + # When using fulltext, we need to retrieve parent items for any attachments 113 + # that matched, since most full-text content comes from PDFs and other attachments 114 + top_level_items = [] 115 + attachment_items = [] 116 + 117 + for item in results: 118 + data = item.get("data", {}) 119 + if "parentItem" in data: 120 + attachment_items.append(item) 121 + else: 122 + top_level_items.append(item) 123 + 124 + # Retrieve parent items for attachments in batches of 50 125 + parent_items = [] 126 + if attachment_items: 127 + parent_ids = list( 128 + {item["data"]["parentItem"] for item in attachment_items} 129 + ) 130 + for chunk in chunks(parent_ids, 50): 131 + parent_items.extend(zot.get_subset(chunk)) 132 + 133 + # Combine top-level items and parent items, removing duplicates by key 134 + all_items = top_level_items + parent_items 135 + items_dict = {item["data"]["key"]: item for item in all_items} 136 + results = list(items_dict.values()) 137 + # Non-fulltext search: use top() or collection_items_top() as before 138 + elif collection: 97 139 results = zot.collection_items_top(collection, **params) 98 140 else: 99 141 results = zot.top(**params)

+1 -1

uv.lock

··· 784 784 785 785 [[package]] 786 786 name = "pyzotero" 787 - version = "1.7.0" 787 + version = "1.7.1" 788 788 source = { editable = "." } 789 789 dependencies = [ 790 790 { name = "bibtexparser" },