# SPDX-License-Identifier: AGPL-3.0-or-later
"""Processor used for ``online`` engines."""

__all__ = ["OnlineProcessor", "OnlineParams"]

import typing as t
from timeit import default_timer
import asyncio
import ssl

import httpx

import searx.network
from searx.utils import gen_useragent
from searx.exceptions import (
    SearxEngineAccessDeniedException,
    SearxEngineCaptchaException,
    SearxEngineTooManyRequestsException,
)
from searx.metrics.error_recorder import count_error
from .abstract import EngineProcessor, RequestParams
if t.TYPE_CHECKING:
    from searx.search.models import SearchQuery
    from searx.results import ResultContainer
    from searx.result_types import EngineResults


class HTTPParams(t.TypedDict):
    """HTTP request parameters."""

    method: t.Literal["GET", "POST"]
    """HTTP request method."""

    headers: dict[str, str]
    """HTTP header information."""

    data: dict[str, str]
    """Sending `form encoded data`_.

    .. _form encoded data:
       https://www.python-httpx.org/quickstart/#sending-form-encoded-data
    """

    json: dict[str, t.Any]
    """Sending `JSON encoded data`_.

    .. _JSON encoded data:
       https://www.python-httpx.org/quickstart/#sending-json-encoded-data
    """

    content: bytes
    """Sending `binary request data`_.

    .. _binary request data:
       https://www.python-httpx.org/quickstart/#sending-binary-request-data
    """

    url: str
    """Requested URL."""

    cookies: dict[str, str]
    """HTTP cookies."""

    allow_redirects: bool
    """Follow redirects."""

    max_redirects: int
    """Maximum redirects, hard limit."""

    soft_max_redirects: int
    """Maximum redirects, soft limit. Record an error but don't stop the engine."""

    verify: None | t.Literal[False] | str
    """If not ``None``, it overrides the verify value defined in the network.
    Use ``False`` to accept any server certificate, or a path to a certificate
    file to verify the server against."""

    auth: str | None
    """An authentication to use when sending requests."""

    raise_for_httperror: bool
    """Raise an exception if the `HTTP response status code`_ is ``>= 300``.

    .. _HTTP response status code:
       https://developer.mozilla.org/en-US/docs/Web/HTTP/Reference/Status
    """


class OnlineParams(HTTPParams, RequestParams):
    """Request parameters of an ``online`` engine."""


def default_request_params() -> HTTPParams:
    """Default request parameters for ``online`` engines."""
    return {
        "method": "GET",
        "headers": {},
        "data": {},
        "json": {},
        "content": b"",
        "url": "",
        "cookies": {},
        "allow_redirects": False,
        "max_redirects": 0,
        "soft_max_redirects": 0,
        "auth": None,
        "verify": None,
        "raise_for_httperror": True,
    }
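

# A minimal sketch (not part of this module's API) of how an engine module's
# ``request(query, params)`` callback typically completes these defaults; the
# endpoint, query parameter and header value below are hypothetical.
def _example_engine_request(query: str, params: OnlineParams) -> None:
    from urllib.parse import urlencode  # stdlib, used only in this sketch

    params["url"] = "https://example.org/api?" + urlencode({"q": query})  # hypothetical endpoint
    params["headers"]["Accept"] = "application/json"
    params["soft_max_redirects"] = 1  # tolerate one redirect without recording an error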


class OnlineProcessor(EngineProcessor):
    """Processor class for ``online`` engines."""

    engine_type: str = "online"

    def init_engine(self) -> bool:
        """This method is called in a thread; before the base method is
        called, the network must be set up for the ``online`` engines."""
        self.init_network_in_thread(start_time=default_timer(), timeout_limit=self.engine.timeout)
        return super().init_engine()

    def init_network_in_thread(self, start_time: float, timeout_limit: float):
        # set the timeout for all HTTP requests
        searx.network.set_timeout_for_thread(timeout_limit, start_time=start_time)
        # reset the HTTP total time
        searx.network.reset_time_for_thread()
        # set the network
        searx.network.set_context_network_name(self.engine.name)
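
    # Note: ``set_timeout_for_thread``, ``reset_time_for_thread`` and
    # ``set_context_network_name`` store their values in thread-local context,
    # which is why ``init_network_in_thread`` must run in the same thread that
    # later sends the HTTP requests.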

    def get_params(self, search_query: "SearchQuery", engine_category: str) -> OnlineParams | None:
        """Returns a dictionary with the :ref:`request params <engine request
        online>` (:py:obj:`OnlineParams`); if the search condition is not
        supported by the engine, ``None`` is returned."""
        base_params: RequestParams | None = super().get_params(search_query, engine_category)
        if base_params is None:
            return None

        params: OnlineParams = {**default_request_params(), **base_params}
        headers = params["headers"]

        # add a User-Agent header
        headers["User-Agent"] = gen_useragent()

        # add an Accept-Language header
        if self.engine.send_accept_language_header and search_query.locale:
            ac_lang = search_query.locale.language
            if search_query.locale.territory:
                ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                    search_query.locale.language,
                    search_query.locale.territory,
                    search_query.locale.language,
                )
            headers["Accept-Language"] = ac_lang

        self.logger.debug("HTTP Accept-Language: %s", headers.get("Accept-Language", ""))
        return params
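
    # For illustration: with ``send_accept_language_header`` enabled and a
    # ``fr-CH`` locale, the method above builds the header
    # ``Accept-Language: fr-CH,fr;q=0.9,*;q=0.5``.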

    def _send_http_request(self, params: OnlineParams):
        # create a dictionary that contains all information about the request
        request_args: dict[str, t.Any] = {
            "headers": params["headers"],
            "cookies": params["cookies"],
            "auth": params["auth"],
        }

        # verify
        verify = params.get("verify")
        if verify is not None:
            request_args["verify"] = verify

        # max_redirects
        max_redirects = params.get("max_redirects")
        if max_redirects:
            request_args["max_redirects"] = max_redirects

        # allow_redirects
        if "allow_redirects" in params:
            request_args["allow_redirects"] = params["allow_redirects"]

        # soft_max_redirects
        soft_max_redirects: int = params.get("soft_max_redirects", max_redirects or 0)

        # raise_for_httperror
        request_args["raise_for_httperror"] = params.get("raise_for_httperror", True)

        # specific type of request (GET or POST)
        if params["method"] == "GET":
            req = searx.network.get
        else:
            req = searx.network.post
            if params["data"]:
                request_args["data"] = params["data"]
            if params["json"]:
                request_args["json"] = params["json"]
            if params["content"]:
                request_args["content"] = params["content"]

        # send the request
        response = req(params["url"], **request_args)

        # check the soft limit of the redirect count
        if len(response.history) > soft_max_redirects:
            # unexpected redirect: record an error,
            # but the engine might still return valid results
            status_code = str(response.status_code or "")
            reason = response.reason_phrase or ""
            hostname = response.url.host
            count_error(
                self.engine.name,
                "{} redirects, maximum: {}".format(len(response.history), soft_max_redirects),
                (status_code, reason, hostname),
                secondary=True,
            )

        return response
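
    # Note the two redirect limits: exceeding ``max_redirects`` is a hard
    # failure raised by the network layer, while exceeding ``soft_max_redirects``
    # (checked above) only records a secondary error and the response is still
    # returned for parsing.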

    def _search_basic(self, query: str, params: OnlineParams) -> "EngineResults | None":
        # update the request parameters, dependent on the search engine
        # (modules in the engines folder)
        self.engine.request(query, params)

        # ignore empty URLs
        if not params["url"]:
            return None

        # send the request
        response = self._send_http_request(params)

        # parse the response
        response.search_params = params
        return self.engine.response(response)

    def search(  # pyright: ignore[reportIncompatibleMethodOverride]
        self,
        query: str,
        params: OnlineParams,
        result_container: "ResultContainer",
        start_time: float,
        timeout_limit: float,
    ):
        self.init_network_in_thread(start_time, timeout_limit)

        try:
            # send the requests and parse the results
            search_results = self._search_basic(query, params)
            self.extend_container(result_container, start_time, search_results)
        except ssl.SSLError as e:
            # SSL or certificate error
            self.handle_exception(result_container, e, suspend=True)
            self.logger.error("SSLError {}, verify={}".format(e, searx.network.get_network(self.engine.name).verify))
        except (httpx.TimeoutException, asyncio.TimeoutError) as e:
            # requests timeout (connect or read)
            self.handle_exception(result_container, e, suspend=True)
            self.logger.error(
                "HTTP requests timeout (search duration: {0} s, timeout: {1} s): {2}".format(
                    default_timer() - start_time, timeout_limit, e.__class__.__name__
                )
            )
        except (httpx.HTTPError, httpx.StreamError) as e:
            # other request exceptions
            self.handle_exception(result_container, e, suspend=True)
            self.logger.exception(
                "requests exception (search duration: {0} s, timeout: {1} s): {2}".format(
                    default_timer() - start_time, timeout_limit, e
                )
            )
        except (
            SearxEngineCaptchaException,
            SearxEngineTooManyRequestsException,
            SearxEngineAccessDeniedException,
        ) as e:
            self.handle_exception(result_container, e, suspend=True)
            self.logger.exception(e.message)
        except Exception as e:  # pylint: disable=broad-except
            self.handle_exception(result_container, e)
            self.logger.exception("exception: {0}".format(e))
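

# Usage sketch (hypothetical wiring; the constructor arguments are assumptions):
# the processor drives one request/response cycle per search.
#
#     processor = OnlineProcessor(engine, "example engine")
#     params = processor.get_params(search_query, "general")
#     if params is not None:
#         processor.search(search_query.query, params, result_container,
#                          start_time=default_timer(), timeout_limit=engine.timeout)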