# Source code for searx.engines.core
# SPDX-License-Identifier: AGPL-3.0-or-later
"""CORE_ (COnnecting REpositories) provides a comprehensive bibliographic
database of the world’s scholarly literature, collecting and indexing
research from repositories and journals.
.. _CORE: https://core.ac.uk/about
.. note::
The CORE engine requires an :py:obj:`API key <api_key>`.
.. _core engine config:
Configuration
=============
The engine has the following additional settings:
- :py:obj:`api_key`
.. code:: yaml
- name: core.ac.uk
api_key: "..."
inactive: false
Implementations
===============
"""
import typing as t
from datetime import datetime
from urllib.parse import urlencode
from searx.result_types import EngineResults
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# Static metadata describing the engine: the upstream website, its Wikidata
# entry, the API documentation, and the fact that the official JSON API is
# used and needs an API key.
about = {
"website": "https://core.ac.uk",
"wikidata_id": "Q22661180",
"official_api_documentation": "https://api.core.ac.uk/docs/v3",
"use_official_api": True,
"require_api_key": True,
"results": "JSON",
}
# API key; sent by request() as a bearer token in the ``Authorization`` header.
# Empty by default.
api_key = ""
"""For an API key register at https://core.ac.uk/services/api and insert
the API key in the engine :ref:`core engine config`."""
# Categories of this engine.
categories = ["science", "scientific publications"]
# request() reads ``params["pageno"]`` to compute the result offset.
paging = True
# Number of results requested per page (the ``limit`` query argument) and the
# step used to turn a page number into an ``offset``.
nb_per_page = 10
# Endpoint of the works search; request() appends the query string to it.
base_url = "https://api.core.ac.uk/v3/search/works/"
# [docs]
def setup(engine_settings: dict[str, t.Any]) -> bool:
"""Initialization of the CORE_ engine, checks whether the :py:obj:`api_key`
is set, otherwise the engine is inactive.
"""
key: str = engine_settings.get("api_key", "")
if key and key not in ("unset", "unknown", "..."):
return True
logger.error("CORE's API key is not set or invalid.")
return False
def request(query: str, params: "OnlineParams") -> None:
    """Fill *params* with the URL and headers of a CORE API v3 search.

    The page number in ``params["pageno"]`` is translated into the
    ``offset`` / ``limit`` pair used by API v3, and the configured
    :py:obj:`api_key` is sent as a bearer token.
    """
    # API v3 pages by offset: page 1 starts at 0, page 2 at nb_per_page, ...
    first_hit = (params["pageno"] - 1) * nb_per_page
    query_string = urlencode(
        {
            "q": query,
            "offset": first_hit,
            "limit": nb_per_page,
            "sort": "relevance",
        }
    )
    params["url"] = f"{base_url}?{query_string}"
    params["headers"] = {"Authorization": f"Bearer {api_key}"}
def _first_url(value: t.Any) -> str:
    """Return *value* if it is a string, the first non-empty string of
    *value* if it is a list or tuple, and an empty string otherwise."""
    if isinstance(value, str):
        return value
    if isinstance(value, (list, tuple)):
        for item in value:
            if isinstance(item, str) and item:
                return item
    return ""


def _result_url(result: dict[str, t.Any]) -> t.Optional[str]:
    """Pick the link of a result item: DOI first, then the CORE works page,
    then the download URL, then the first source full-text URL.  Returns
    ``None`` when none of them is available."""
    doi = result.get("doi")
    if doi:
        return f"https://doi.org/{doi}"
    if result.get("id"):
        return "https://core.ac.uk/works/" + str(result["id"])
    return _first_url(result.get("downloadUrl")) or _first_url(result.get("sourceFulltextUrls")) or None


def _published_date(result: dict[str, t.Any]) -> t.Optional[datetime]:
    """Parse ``publishedDate`` (falling back to ``depositedDate``) of a result
    item.  Returns ``None`` when neither holds a parsable ISO date string."""
    for key in ("publishedDate", "depositedDate"):
        raw_date = result.get(key)
        if not raw_date or not isinstance(raw_date, str):
            continue
        try:
            # A trailing "Z" is rewritten to an explicit UTC offset before parsing.
            return datetime.fromisoformat(raw_date.replace("Z", "+00:00"))
        except ValueError:
            continue
    return None


def response(resp: "SXNG_Response") -> EngineResults:
    """Convert the JSON answer of the works search into ``Paper`` results.

    Items without a title or without any usable link are skipped.  Fields
    that are missing or ``null`` in the JSON are treated as empty instead of
    raising.

    :param resp: HTTP response whose ``json()`` holds a ``results`` list.
    :return: the collected results.
    """
    res = EngineResults()
    json_data = resp.json()

    for result in json_data.get("results") or []:
        if not result.get("title"):
            continue

        url = _result_url(result)
        if url is None:
            continue

        journals = [j.get("title") for j in result.get("journals") or [] if j.get("title")]

        # ``or ""`` / ``or []`` guard against keys that are present but null.
        publisher = (result.get("publisher") or "").strip("'")

        authors: set[str] = set()
        for author in result.get("authors") or []:
            name = author.get("name")
            if name:
                authors.add(name)

        res.add(
            res.types.Paper(
                title=result.get("title"),
                url=url,
                content=result.get("fullText", "") or "",
                tags=result.get("fieldOfStudy") or [],
                publishedDate=_published_date(result),
                type=result.get("documentType", "") or "",
                authors=authors,
                editor=", ".join(result.get("contributors") or []),
                publisher=publisher,
                journal=", ".join(journals),
                doi=result.get("doi"),
                pdf_url=_first_url(result.get("downloadUrl")) or _first_url(result.get("sourceFulltextUrls")),
            )
        )

    return res