# SPDX-License-Identifier: AGPL-3.0-or-later
"""Google Scholar is a freely accessible web search engine that indexes the
full text or metadata of scholarly literature across an array of publishing
formats and disciplines.

Compared to other Google services, the Scholar engine has a simple GET
REST-API and there does not exist an ``async`` API.  Even though the API is
slightly vintage, we can make use of the :ref:`google API` to assemble the
arguments of the GET request.

Configuration
=============

.. code:: yaml

  - name: google scholar
    engine: google_scholar
    shortcut: gos

Implementations
===============

"""
import typing as t
from urllib.parse import urlencode
from datetime import datetime
from lxml import html
import httpx
from searx.utils import (
    eval_xpath,
    eval_xpath_getindex,
    eval_xpath_list,
    extract_text,
    ElementType,
)
from searx.exceptions import SearxEngineCaptchaException, SearxEngineAccessDeniedException
from searx.engines.google import fetch_traits # pylint: disable=unused-import
from searx.engines.google import (
    get_google_info,
    time_range_dict,
)
from searx.result_types import EngineResults
if t.TYPE_CHECKING:
    from searx.extended_types import SXNG_Response
    from searx.search.processors import OnlineParams
about = {
    "website": "https://scholar.google.com",
    "wikidata_id": "Q494817",
    "official_api_documentation": "https://developers.google.com/custom-search",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}
# engine dependent config
categories = ["science", "scientific publications"]
paging = True
max_page = 50
"""`Google max 50 pages`_

.. _Google max 50 pages: https://github.com/searxng/searxng/issues/2982
"""
language_support = True
time_range_support = True
safesearch = False
send_accept_language_header = True
def request(query: str, params: "OnlineParams") -> None:
    """Google-Scholar search request"""

    google_info = get_google_info(params, traits)
    # subdomain is: scholar.google.xy
    google_info["subdomain"] = google_info["subdomain"].replace("www.", "scholar.")

    args = {
        "q": query,
        **google_info["params"],
        "start": (params["pageno"] - 1) * 10,
        "as_sdt": "2007",  # include patents / to disable set "0,5"
        "as_vis": "0",  # include citations / to disable set "1"
    }
    args.update(time_range_args(params))

    params["url"] = "https://" + google_info["subdomain"] + "/scholar?" + urlencode(args)
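    # For illustration only, a sketch of what the assembled URL may look like.
    # The exact subdomain and the extra parameters come from get_google_info();
    # the query and paging values below are hypothetical:
    #
    #   https://scholar.google.com/scholar?q=machine+learning&hl=en&start=10&as_sdt=2007&as_vis=0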
    params["cookies"] = google_info["cookies"]
    params["headers"].update(google_info["headers"])
def response(resp: "SXNG_Response") -> EngineResults:  # pylint: disable=too-many-locals
    """Parse response from Google Scholar"""

    if resp.status_code in (301, 302, 303, 307, 308) and "Location" in resp.headers:
        if "/sorry/index?continue" in resp.headers["Location"]:
            # Our systems have detected unusual traffic from your computer
            # network.  Please try again later.
            raise SearxEngineAccessDeniedException(
                message="google_scholar: unusual traffic detected",
            )
        raise httpx.TooManyRedirects(f"location {resp.headers['Location'].split('?')[0]}")

    res = EngineResults()
    dom = html.fromstring(resp.text)
    detect_google_captcha(dom)

    # parse results
    for result in eval_xpath_list(dom, "//div[@data-rp]"):

        title = extract_text(eval_xpath(result, ".//h3[1]//a"))
        if not title:
            # this is a [ZITATION] block
            continue

        pub_type: str = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']")) or ""
        if pub_type:
            pub_type = pub_type[1:-1].lower()

        url: str = eval_xpath_getindex(result, ".//h3[1]//a/@href", 0)
        content: str = extract_text(eval_xpath(result, ".//div[@class='gs_rs']")) or ""

        authors, journal, publisher, publishedDate = parse_gs_a(
            extract_text(eval_xpath(result, ".//div[@class='gs_a']"))
        )
        if publisher in url:
            publisher = ""

        # cited by
        comments: str = (
            extract_text(eval_xpath(result, ".//div[@class='gs_fl']/a[starts-with(@href,'/scholar?cites=')]")) or ""
        )

        # link to the HTML or PDF document
        html_url: str = ""
        pdf_url: str = ""
        doc_url = eval_xpath_getindex(result, ".//div[@class='gs_or_ggsm']/a/@href", 0, default=None)
        doc_type = extract_text(eval_xpath(result, ".//span[@class='gs_ctg2']"))
        if doc_type == "[PDF]":
            pdf_url = doc_url
        else:
            html_url = doc_url

        res.add(
            res.types.Paper(
                type=pub_type,
                url=url,
                title=title,
                authors=authors,
                publisher=publisher,
                journal=journal,
                publishedDate=publishedDate,
                content=content,
                comments=comments,
                html_url=html_url,
                pdf_url=pdf_url,
            )
        )

    # parse suggestion
    for suggestion in eval_xpath(dom, "//div[contains(@class, 'gs_qsuggest_wrap')]//li//a"):
        res.add(res.types.LegacyResult(suggestion=extract_text(suggestion)))

    for correction in eval_xpath(dom, "//div[@class='gs_r gs_pda']/a"):
        res.add(res.types.LegacyResult(correction=extract_text(correction)))

    return res
def time_range_args(params: "OnlineParams") -> dict[str, int]:
    """Returns a dictionary with time range arguments based on
    ``params["time_range"]``.

    Google Scholar supports a detailed search by year.  Searching by *last
    month* or *last week* (as offered by SearXNG) is uncommon for scientific
    publications and is not supported by Google Scholar.

    To limit the result list when the user selects a range, all the SearXNG
    ranges (*day*, *week*, *month*, *year*) are mapped to *year*.  If no range
    is set, an empty dictionary of arguments is returned.

    Example; when the user selects a time range and we find ourselves in the
    year 2025, the argument is set to the current year minus one:

    .. code:: python

        { "as_ylo" : 2024 }

    """
    ret_val: dict[str, int] = {}
    if params["time_range"] in time_range_dict:
        ret_val["as_ylo"] = datetime.now().year - 1
    return ret_val
def detect_google_captcha(dom: ElementType):
    """In case of a CAPTCHA, Google Scholar opens its own *not a Robot* dialog
    and is not redirected to ``sorry.google.com``.
    """
    if eval_xpath(dom, "//form[@id='gs_captcha_f']"):
        raise SearxEngineCaptchaException(message="CAPTCHA (gs_captcha_f)")
def parse_gs_a(text: str | None) -> tuple[list[str], str, str, datetime | None]:
    """Parse the text written in green.

    Possible formats:

    * "{authors} - {journal}, {year} - {publisher}"
    * "{authors} - {year} - {publisher}"
    * "{authors} - {publisher}"
    """
    if text is None or text == "":
        return [], "", "", None

    s_text = text.split(" - ")
    authors: list[str] = s_text[0].split(", ")
    publisher: str = s_text[-1]
    if len(s_text) != 3:
        return authors, "", publisher, None

    # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
    # get journal and year
    journal_year = s_text[1].split(", ")
    # journal is optional and may contain some commas
    if len(journal_year) > 1:
        journal: str = ", ".join(journal_year[0:-1])
        if journal == "…":
            journal = ""
    else:
        journal = ""
    # year
    year = journal_year[-1]
    try:
        publishedDate = datetime.strptime(year.strip(), "%Y")
    except ValueError:
        publishedDate = None

    return authors, journal, publisher, publishedDate
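
# A minimal illustration of parse_gs_a(); the input string below is hypothetical
# and only shows the common "{authors} - {journal}, {year} - {publisher}" case:
#
#   parse_gs_a("J Doe, A Smith - Nature, 2020 - nature.com")
#   == (["J Doe", "A Smith"], "Nature", "nature.com", datetime(2020, 1, 1))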