Pyzotero: a Python client for the Zotero API pyzotero.readthedocs.io
zotero

Add Semantic Scholar API client and CLI commands (related, citations, references, s2search)


Signed-off-by: Stephan Hügel <shugel@tcd.ie>

+860
+419
src/pyzotero/cli.py
··· 7 7 import httpx 8 8 9 9 from pyzotero import __version__, zotero 10 + from pyzotero.semantic_scholar import ( 11 + PaperNotFoundError, 12 + RateLimitError, 13 + SemanticScholarError, 14 + filter_by_citations, 15 + get_citations, 16 + get_recommendations, 17 + get_references, 18 + search_papers, 19 + ) 10 20 from pyzotero.zotero import chunks 11 21 12 22 ··· 508 518 for doi in not_found: 509 519 click.echo(f" {doi}") 510 520 521 + except Exception as e: 522 + click.echo(f"Error: {e!s}", err=True) 523 + sys.exit(1) 524 + 525 + 526 + def _build_doi_index(zot): 527 + """Build a mapping of normalised DOIs to Zotero item keys. 528 + 529 + Returns: 530 + Dict mapping normalised DOIs to item keys 531 + 532 + """ 533 + doi_map = {} 534 + all_items = zot.everything(zot.items()) 535 + 536 + for item in all_items: 537 + data = item.get("data", {}) 538 + item_doi = data.get("DOI", "") 539 + 540 + if item_doi: 541 + normalised_doi = _normalize_doi(item_doi) 542 + item_key = data.get("key", "") 543 + 544 + if normalised_doi and item_key: 545 + doi_map[normalised_doi] = item_key 546 + 547 + return doi_map 548 + 549 + 550 + def _format_s2_paper(paper, in_library=None): 551 + """Format a Semantic Scholar paper for output. 
552 + 553 + Args: 554 + paper: Normalised paper dict from semantic_scholar module 555 + in_library: Boolean indicating if paper is in local Zotero 556 + 557 + Returns: 558 + Formatted dict for output 559 + 560 + """ 561 + result = { 562 + "paperId": paper.get("paperId"), 563 + "doi": paper.get("doi"), 564 + "title": paper.get("title"), 565 + "authors": [a.get("name") for a in (paper.get("authors") or [])], 566 + "year": paper.get("year"), 567 + "venue": paper.get("venue"), 568 + "citationCount": paper.get("citationCount"), 569 + "referenceCount": paper.get("referenceCount"), 570 + "isOpenAccess": paper.get("isOpenAccess"), 571 + "openAccessPdfUrl": paper.get("openAccessPdfUrl"), 572 + } 573 + 574 + if in_library is not None: 575 + result["inLibrary"] = in_library 576 + 577 + return result 578 + 579 + 580 + def _annotate_with_library(papers, doi_map): 581 + """Annotate papers with in_library status based on DOI matching. 582 + 583 + Args: 584 + papers: List of normalised paper dicts 585 + doi_map: Dict mapping normalised DOIs to Zotero item keys 586 + 587 + Returns: 588 + List of formatted paper dicts with inLibrary field 589 + 590 + """ 591 + results = [] 592 + for paper in papers: 593 + doi = paper.get("doi") 594 + in_library = False 595 + if doi: 596 + normalised = _normalize_doi(doi) 597 + in_library = normalised in doi_map 598 + results.append(_format_s2_paper(paper, in_library)) 599 + return results 600 + 601 + 602 + @main.command() 603 + @click.option( 604 + "--doi", 605 + required=True, 606 + help="DOI of the paper to find related papers for", 607 + ) 608 + @click.option( 609 + "--limit", 610 + type=int, 611 + default=20, 612 + help="Maximum number of results to return (default: 20, max: 500)", 613 + ) 614 + @click.option( 615 + "--min-citations", 616 + type=int, 617 + default=0, 618 + help="Minimum citation count filter (default: 0)", 619 + ) 620 + @click.option( 621 + "--check-library/--no-check-library", 622 + default=True, 623 + help="Check if papers 
@main.command()
@click.option(
    "--doi",
    required=True,
    help="DOI of the paper to find related papers for",
)
@click.option(
    "--limit",
    type=int,
    default=20,
    help="Maximum number of results to return (default: 20, max: 500)",
)
@click.option(
    "--min-citations",
    type=int,
    default=0,
    help="Minimum citation count filter (default: 0)",
)
@click.option(
    "--check-library/--no-check-library",
    default=True,
    help="Check if papers exist in local Zotero (default: True)",
)
@click.pass_context
def related(ctx, doi, limit, min_citations, check_library):
    """Find papers related to a given paper using Semantic Scholar.

    Uses SPECTER2 embeddings to find semantically similar papers.

    Examples:
        pyzotero related --doi "10.1038/nature12373"

        pyzotero related --doi "10.1038/nature12373" --limit 50

        pyzotero related --doi "10.1038/nature12373" --min-citations 100

    """
    try:
        # Ask the recommendations endpoint for semantically similar papers
        click.echo(f"Fetching related papers for DOI: {doi}...", err=True)
        papers = get_recommendations(doi, id_type="doi", limit=limit).get("papers", [])

        # Drop papers below the citation threshold, if one was given
        if min_citations > 0:
            papers = filter_by_citations(papers, min_citations)

        if not papers:
            click.echo(json.dumps({"count": 0, "papers": []}))
            return

        # Cross-reference against the local Zotero library when requested
        if check_library:
            click.echo("Checking local Zotero library...", err=True)
            zot = _get_zotero_client(ctx.obj.get("locale", "en-US"))
            output_papers = _annotate_with_library(papers, _build_doi_index(zot))
        else:
            output_papers = [_format_s2_paper(p) for p in papers]

        click.echo(
            json.dumps({"count": len(output_papers), "papers": output_papers}, indent=2)
        )

    except PaperNotFoundError:
        click.echo("Error: Paper not found in Semantic Scholar.", err=True)
        sys.exit(1)
    except RateLimitError:
        click.echo("Error: Rate limit exceeded. Please wait and try again.", err=True)
        sys.exit(1)
    except SemanticScholarError as e:
        click.echo(f"Error: {e!s}", err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(f"Error: {e!s}", err=True)
        sys.exit(1)
@main.command()
@click.option(
    "--doi",
    required=True,
    help="DOI of the paper to find citations for",
)
@click.option(
    "--limit",
    type=int,
    default=100,
    help="Maximum number of results to return (default: 100, max: 1000)",
)
@click.option(
    "--min-citations",
    type=int,
    default=0,
    help="Minimum citation count filter (default: 0)",
)
@click.option(
    "--check-library/--no-check-library",
    default=True,
    help="Check if papers exist in local Zotero (default: True)",
)
@click.pass_context
def citations(ctx, doi, limit, min_citations, check_library):
    """Find papers that cite a given paper using Semantic Scholar.

    Examples:
        pyzotero citations --doi "10.1038/nature12373"

        pyzotero citations --doi "10.1038/nature12373" --limit 50

        pyzotero citations --doi "10.1038/nature12373" --min-citations 50

    """
    try:
        # Fetch the citing papers from Semantic Scholar
        click.echo(f"Fetching citations for DOI: {doi}...", err=True)
        papers = get_citations(doi, id_type="doi", limit=limit).get("papers", [])

        # Drop papers below the citation threshold, if one was given
        if min_citations > 0:
            papers = filter_by_citations(papers, min_citations)

        if not papers:
            click.echo(json.dumps({"count": 0, "papers": []}))
            return

        # Cross-reference against the local Zotero library when requested
        if check_library:
            click.echo("Checking local Zotero library...", err=True)
            zot = _get_zotero_client(ctx.obj.get("locale", "en-US"))
            output_papers = _annotate_with_library(papers, _build_doi_index(zot))
        else:
            output_papers = [_format_s2_paper(p) for p in papers]

        click.echo(
            json.dumps({"count": len(output_papers), "papers": output_papers}, indent=2)
        )

    except PaperNotFoundError:
        click.echo("Error: Paper not found in Semantic Scholar.", err=True)
        sys.exit(1)
    except RateLimitError:
        click.echo("Error: Rate limit exceeded. Please wait and try again.", err=True)
        sys.exit(1)
    except SemanticScholarError as e:
        click.echo(f"Error: {e!s}", err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(f"Error: {e!s}", err=True)
        sys.exit(1)
@main.command()
@click.option(
    "--doi",
    required=True,
    help="DOI of the paper to find references for",
)
@click.option(
    "--limit",
    type=int,
    default=100,
    help="Maximum number of results to return (default: 100, max: 1000)",
)
@click.option(
    "--min-citations",
    type=int,
    default=0,
    help="Minimum citation count filter (default: 0)",
)
@click.option(
    "--check-library/--no-check-library",
    default=True,
    help="Check if papers exist in local Zotero (default: True)",
)
@click.pass_context
def references(ctx, doi, limit, min_citations, check_library):
    """Find papers referenced by a given paper using Semantic Scholar.

    Examples:
        pyzotero references --doi "10.1038/nature12373"

        pyzotero references --doi "10.1038/nature12373" --limit 50

        pyzotero references --doi "10.1038/nature12373" --min-citations 100

    """
    try:
        # Fetch the reference list from Semantic Scholar
        click.echo(f"Fetching references for DOI: {doi}...", err=True)
        papers = get_references(doi, id_type="doi", limit=limit).get("papers", [])

        # Drop papers below the citation threshold, if one was given
        if min_citations > 0:
            papers = filter_by_citations(papers, min_citations)

        if not papers:
            click.echo(json.dumps({"count": 0, "papers": []}))
            return

        # Cross-reference against the local Zotero library when requested
        if check_library:
            click.echo("Checking local Zotero library...", err=True)
            zot = _get_zotero_client(ctx.obj.get("locale", "en-US"))
            output_papers = _annotate_with_library(papers, _build_doi_index(zot))
        else:
            output_papers = [_format_s2_paper(p) for p in papers]

        click.echo(
            json.dumps({"count": len(output_papers), "papers": output_papers}, indent=2)
        )

    except PaperNotFoundError:
        click.echo("Error: Paper not found in Semantic Scholar.", err=True)
        sys.exit(1)
    except RateLimitError:
        click.echo("Error: Rate limit exceeded. Please wait and try again.", err=True)
        sys.exit(1)
    except SemanticScholarError as e:
        click.echo(f"Error: {e!s}", err=True)
        sys.exit(1)
    except Exception as e:
        click.echo(f"Error: {e!s}", err=True)
        sys.exit(1)
878 + 879 + Examples: 880 + pyzotero s2search -q "climate adaptation" 881 + 882 + pyzotero s2search -q "machine learning" --year 2020-2024 883 + 884 + pyzotero s2search -q "neural networks" --open-access --limit 50 885 + 886 + pyzotero s2search -q "deep learning" --sort citations --min-citations 100 887 + 888 + """ 889 + try: 890 + # Search Semantic Scholar 891 + click.echo(f'Searching Semantic Scholar for: "{query}"...', err=True) 892 + result = search_papers( 893 + query, 894 + limit=limit, 895 + year=year, 896 + open_access_only=open_access, 897 + sort=sort, 898 + min_citations=min_citations, 899 + ) 900 + papers = result.get("papers", []) 901 + total = result.get("total", len(papers)) 902 + 903 + if not papers: 904 + click.echo(json.dumps({"count": 0, "total": total, "papers": []})) 905 + return 906 + 907 + # Optionally annotate with library status 908 + if check_library: 909 + click.echo("Checking local Zotero library...", err=True) 910 + locale = ctx.obj.get("locale", "en-US") 911 + zot = _get_zotero_client(locale) 912 + doi_map = _build_doi_index(zot) 913 + output_papers = _annotate_with_library(papers, doi_map) 914 + else: 915 + output_papers = [_format_s2_paper(p) for p in papers] 916 + 917 + click.echo( 918 + json.dumps( 919 + {"count": len(output_papers), "total": total, "papers": output_papers}, 920 + indent=2, 921 + ) 922 + ) 923 + 924 + except RateLimitError: 925 + click.echo("Error: Rate limit exceeded. Please wait and try again.", err=True) 926 + sys.exit(1) 927 + except SemanticScholarError as e: 928 + click.echo(f"Error: {e!s}", err=True) 929 + sys.exit(1) 511 930 except Exception as e: 512 931 click.echo(f"Error: {e!s}", err=True) 513 932 sys.exit(1)
+441
src/pyzotero/semantic_scholar.py
··· 1 + """Semantic Scholar API client for pyzotero. 2 + 3 + This module provides functions to interact with the Semantic Scholar Graph API 4 + for fetching paper metadata, citations, references, and recommendations. 5 + 6 + API Documentation: https://api.semanticscholar.org/api-docs 7 + """ 8 + 9 + import httpx 10 + from httpx import codes as http 11 + 12 + BASE_URL = "https://api.semanticscholar.org/graph/v1" 13 + RECOMMENDATIONS_URL = "https://api.semanticscholar.org/recommendations/v1" 14 + 15 + # Fields to request from the Semantic Scholar API 16 + DEFAULT_FIELDS = [ 17 + "paperId", 18 + "externalIds", 19 + "title", 20 + "abstract", 21 + "venue", 22 + "year", 23 + "referenceCount", 24 + "citationCount", 25 + "influentialCitationCount", 26 + "isOpenAccess", 27 + "openAccessPdf", 28 + "authors", 29 + "publicationTypes", 30 + "publicationDate", 31 + ] 32 + 33 + # Timeout for API requests (seconds) 34 + REQUEST_TIMEOUT = 30.0 35 + 36 + 37 + class SemanticScholarError(Exception): 38 + """Base exception for Semantic Scholar API errors.""" 39 + 40 + 41 + class RateLimitError(SemanticScholarError): 42 + """Raised when API rate limit is exceeded.""" 43 + 44 + def __init__(self, msg="Rate limit exceeded. Please wait and try again."): 45 + super().__init__(msg) 46 + 47 + 48 + class PaperNotFoundError(SemanticScholarError): 49 + """Raised when a paper is not found.""" 50 + 51 + def __init__(self, msg="Paper not found."): 52 + super().__init__(msg) 53 + 54 + 55 + def _make_request(url, params=None): 56 + """Make an HTTP GET request to the Semantic Scholar API. 
57 + 58 + Args: 59 + url: The full URL to request 60 + params: Optional dict of query parameters 61 + 62 + Returns: 63 + The JSON response as a dict 64 + 65 + Raises: 66 + RateLimitError: If rate limit is exceeded (HTTP 429) 67 + PaperNotFoundError: If paper is not found (HTTP 404) 68 + SemanticScholarError: For other API errors 69 + 70 + """ 71 + with httpx.Client(timeout=REQUEST_TIMEOUT) as client: 72 + response = client.get(url, params=params) 73 + 74 + _check_response(response) 75 + return response.json() 76 + 77 + 78 + def _check_response(response): 79 + """Check HTTP response and raise appropriate exceptions. 80 + 81 + Args: 82 + response: httpx Response object 83 + 84 + Raises: 85 + RateLimitError: If rate limit is exceeded (HTTP 429) 86 + PaperNotFoundError: If paper is not found (HTTP 404) 87 + SemanticScholarError: For other API errors 88 + 89 + """ 90 + if response.status_code == http.TOO_MANY_REQUESTS: 91 + raise RateLimitError 92 + 93 + if response.status_code == http.NOT_FOUND: 94 + raise PaperNotFoundError 95 + 96 + if response.status_code != http.OK: 97 + msg = f"Semantic Scholar API error: {response.status_code} - {response.text}" 98 + raise SemanticScholarError(msg) 99 + 100 + 101 + def _format_paper_id(identifier, id_type=None): # noqa: PLR0911 102 + """Format a paper identifier for the Semantic Scholar API. 
103 + 104 + Semantic Scholar accepts various identifier formats: 105 + - DOI: DOI:10.1234/example 106 + - arXiv: ARXIV:1234.5678 107 + - Semantic Scholar ID: direct use 108 + - PMID: PMID:12345678 109 + - MAG: MAG:12345678 110 + - ACL: ACL:P19-1234 111 + - CorpusID: CorpusId:12345678 112 + 113 + Args: 114 + identifier: The paper identifier 115 + id_type: Optional type hint ("doi", "arxiv", "pmid", "mag", "acl", "corpus") 116 + 117 + Returns: 118 + Formatted identifier string for the API 119 + 120 + """ 121 + if not identifier: 122 + return identifier 123 + 124 + identifier = identifier.strip() 125 + 126 + # If already prefixed, return as-is 127 + known_prefixes = ["DOI:", "ARXIV:", "PMID:", "MAG:", "ACL:", "CorpusId:"] 128 + for prefix in known_prefixes: 129 + if identifier.upper().startswith(prefix.upper()): 130 + return identifier 131 + 132 + # Strip common DOI URL prefixes 133 + doi_prefixes = ["https://doi.org/", "http://doi.org/", "doi:"] 134 + for prefix in doi_prefixes: 135 + if identifier.lower().startswith(prefix.lower()): 136 + identifier = identifier[len(prefix) :] 137 + return f"DOI:{identifier}" 138 + 139 + # If type hint provided, add appropriate prefix 140 + if id_type: 141 + type_map = { 142 + "doi": "DOI:", 143 + "arxiv": "ARXIV:", 144 + "pmid": "PMID:", 145 + "mag": "MAG:", 146 + "acl": "ACL:", 147 + "corpus": "CorpusId:", 148 + } 149 + prefix = type_map.get(id_type.lower()) 150 + if prefix: 151 + return f"{prefix}{identifier}" 152 + 153 + # Heuristic detection 154 + # DOIs typically contain a slash and start with 10. 155 + if "/" in identifier and identifier.startswith("10."): 156 + return f"DOI:{identifier}" 157 + 158 + # arXiv IDs have a specific format (YYMM.NNNNN or category/YYMMNNN) 159 + if "." 
in identifier and identifier.split(".")[0].isdigit(): 160 + return f"ARXIV:{identifier}" 161 + 162 + # If all else fails, assume it's a Semantic Scholar ID 163 + return identifier 164 + 165 + 166 + def _normalise_paper(paper_data): 167 + """Normalise paper data from Semantic Scholar to a consistent format. 168 + 169 + Args: 170 + paper_data: Raw paper data from the API 171 + 172 + Returns: 173 + Normalised paper dict with consistent field names 174 + 175 + """ 176 + if not paper_data: 177 + return None 178 + 179 + external_ids = paper_data.get("externalIds") or {} 180 + authors = paper_data.get("authors") or [] 181 + open_access_pdf = paper_data.get("openAccessPdf") or {} 182 + 183 + return { 184 + "paperId": paper_data.get("paperId"), 185 + "doi": external_ids.get("DOI"), 186 + "arxivId": external_ids.get("ArXiv"), 187 + "pmid": external_ids.get("PubMed"), 188 + "title": paper_data.get("title"), 189 + "abstract": paper_data.get("abstract"), 190 + "venue": paper_data.get("venue"), 191 + "year": paper_data.get("year"), 192 + "authors": [ 193 + { 194 + "authorId": a.get("authorId"), 195 + "name": a.get("name"), 196 + } 197 + for a in authors 198 + ], 199 + "citationCount": paper_data.get("citationCount"), 200 + "referenceCount": paper_data.get("referenceCount"), 201 + "influentialCitationCount": paper_data.get("influentialCitationCount"), 202 + "isOpenAccess": paper_data.get("isOpenAccess"), 203 + "openAccessPdfUrl": open_access_pdf.get("url"), 204 + "publicationTypes": paper_data.get("publicationTypes"), 205 + "publicationDate": paper_data.get("publicationDate"), 206 + } 207 + 208 + 209 + def get_paper(identifier, id_type=None): 210 + """Get details for a single paper. 211 + 212 + Args: 213 + identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.) 
214 + id_type: Optional type hint for the identifier 215 + 216 + Returns: 217 + Normalised paper dict 218 + 219 + Raises: 220 + PaperNotFoundError: If paper is not found 221 + SemanticScholarError: For API errors 222 + 223 + """ 224 + paper_id = _format_paper_id(identifier, id_type) 225 + url = f"{BASE_URL}/paper/{paper_id}" 226 + params = {"fields": ",".join(DEFAULT_FIELDS)} 227 + 228 + data = _make_request(url, params) 229 + return _normalise_paper(data) 230 + 231 + 232 + def get_citations(identifier, id_type=None, limit=100, offset=0): 233 + """Get papers that cite a given paper. 234 + 235 + Args: 236 + identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.) 237 + id_type: Optional type hint for the identifier 238 + limit: Maximum number of results (default 100, max 1000) 239 + offset: Offset for pagination 240 + 241 + Returns: 242 + Dict with 'total' count and 'papers' list 243 + 244 + Raises: 245 + PaperNotFoundError: If paper is not found 246 + SemanticScholarError: For API errors 247 + 248 + """ 249 + paper_id = _format_paper_id(identifier, id_type) 250 + url = f"{BASE_URL}/paper/{paper_id}/citations" 251 + params = { 252 + "fields": ",".join(DEFAULT_FIELDS), 253 + "limit": min(limit, 1000), 254 + "offset": offset, 255 + } 256 + 257 + data = _make_request(url, params) 258 + 259 + # Citations API returns {"data": [...], "offset": N, "next": N} 260 + papers = [] 261 + for item in data.get("data", []): 262 + citing_paper = item.get("citingPaper") 263 + if citing_paper: 264 + papers.append(_normalise_paper(citing_paper)) 265 + 266 + return { 267 + "total": len(papers), 268 + "offset": data.get("offset", 0), 269 + "papers": papers, 270 + } 271 + 272 + 273 + def get_references(identifier, id_type=None, limit=100, offset=0): 274 + """Get papers that a given paper references. 275 + 276 + Args: 277 + identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.) 
278 + id_type: Optional type hint for the identifier 279 + limit: Maximum number of results (default 100, max 1000) 280 + offset: Offset for pagination 281 + 282 + Returns: 283 + Dict with 'total' count and 'papers' list 284 + 285 + Raises: 286 + PaperNotFoundError: If paper is not found 287 + SemanticScholarError: For API errors 288 + 289 + """ 290 + paper_id = _format_paper_id(identifier, id_type) 291 + url = f"{BASE_URL}/paper/{paper_id}/references" 292 + params = { 293 + "fields": ",".join(DEFAULT_FIELDS), 294 + "limit": min(limit, 1000), 295 + "offset": offset, 296 + } 297 + 298 + data = _make_request(url, params) 299 + 300 + # References API returns {"data": [...], "offset": N, "next": N} 301 + papers = [] 302 + for item in data.get("data", []): 303 + cited_paper = item.get("citedPaper") 304 + if cited_paper: 305 + papers.append(_normalise_paper(cited_paper)) 306 + 307 + return { 308 + "total": len(papers), 309 + "offset": data.get("offset", 0), 310 + "papers": papers, 311 + } 312 + 313 + 314 + def get_recommendations(identifier, id_type=None, limit=100): 315 + """Get recommended papers based on a seed paper. 316 + 317 + Uses Semantic Scholar's recommendation API which returns papers 318 + similar to the input based on SPECTER2 embeddings. 319 + 320 + Args: 321 + identifier: Paper identifier (DOI, arXiv ID, S2 ID, etc.) 
322 + id_type: Optional type hint for the identifier 323 + limit: Maximum number of recommendations (default 100, max 500) 324 + 325 + Returns: 326 + Dict with 'papers' list of recommended papers 327 + 328 + Raises: 329 + PaperNotFoundError: If paper is not found 330 + SemanticScholarError: For API errors 331 + 332 + """ 333 + # First, get the paper to obtain its Semantic Scholar ID 334 + paper = get_paper(identifier, id_type) 335 + paper_id = paper.get("paperId") 336 + 337 + if not paper_id: 338 + raise PaperNotFoundError 339 + 340 + url = f"{RECOMMENDATIONS_URL}/papers" 341 + params = { 342 + "fields": ",".join(DEFAULT_FIELDS), 343 + "limit": min(limit, 500), 344 + } 345 + 346 + # POST request with paper IDs in body 347 + with httpx.Client(timeout=REQUEST_TIMEOUT) as client: 348 + response = client.post( 349 + url, 350 + params=params, 351 + json={"positivePaperIds": [paper_id]}, 352 + ) 353 + _check_response(response) 354 + data = response.json() 355 + 356 + papers = [_normalise_paper(p) for p in data.get("recommendedPapers", [])] 357 + 358 + return {"papers": papers} 359 + 360 + 361 + def search_papers( 362 + query, 363 + limit=100, 364 + offset=0, 365 + year=None, 366 + open_access_only=False, 367 + sort=None, 368 + min_citations=None, 369 + ): 370 + """Search for papers by keyword query. 
def search_papers(
    query,
    limit=100,
    offset=0,
    year=None,
    open_access_only=False,
    sort=None,
    min_citations=None,
):
    """Search for papers by keyword query.

    Args:
        query: Search query string
        limit: Maximum number of results (default 100, max 100)
        offset: Offset for pagination
        year: Optional year filter (e.g., "2020", "2018-2022", "2020-")
        open_access_only: If True, only return open access papers
        sort: Sort order - "citationCount" (descending) or "year" (descending)
        min_citations: Minimum citation count filter (applied client-side)

    Returns:
        Dict with 'total' count, 'offset', and 'papers' list

    Raises:
        SemanticScholarError: For API errors

    """
    params = {
        "query": query,
        "fields": ",".join(DEFAULT_FIELDS),
        "limit": min(limit, 100),  # API max is 100 per request
        "offset": offset,
    }

    if year:
        params["year"] = year

    if open_access_only:
        # Presence of the (empty) openAccessPdf param restricts to OA papers
        params["openAccessPdf"] = ""

    if sort:
        # Map friendly sort names onto the API's descending sort keys
        api_sort = {
            "citationCount": "citationCount:desc",
            "citations": "citationCount:desc",
            "year": "publicationDate:desc",
            "date": "publicationDate:desc",
        }.get(sort)
        if api_sort:
            params["sort"] = api_sort

    data = _make_request(f"{BASE_URL}/paper/search", params)
    papers = [_normalise_paper(p) for p in data.get("data", [])]

    # Citation-count filtering is not supported server-side; apply it here
    if min_citations is not None and min_citations > 0:
        papers = [p for p in papers if (p.get("citationCount") or 0) >= min_citations]

    return {
        "total": data.get("total", len(papers)),
        "offset": data.get("offset", 0),
        "papers": papers,
    }


def filter_by_citations(papers, min_citations):
    """Filter a list of papers by minimum citation count.

    Args:
        papers: List of normalised paper dicts
        min_citations: Minimum citation count

    Returns:
        Filtered list of papers; the input list is returned unchanged when
        no positive threshold is given

    """
    if min_citations is None or min_citations <= 0:
        return papers
    return [
        paper for paper in papers if (paper.get("citationCount") or 0) >= min_citations
    ]