# Source code for searx.utils

# SPDX-License-Identifier: AGPL-3.0-or-later
"""Utility functions for the engines"""


import re
import importlib
import importlib.util
import json
import types

import typing as t
from collections.abc import MutableMapping, Callable

from numbers import Number
from os.path import splitext, join
from random import choice, randint
from html.parser import HTMLParser
from html import escape
from urllib.parse import urljoin, urlparse, parse_qs, urlencode
from datetime import timedelta
from markdown_it import MarkdownIt

from lxml import html
from lxml.etree import XPath, XPathError, XPathSyntaxError
from lxml.etree import ElementBase, _Element  # pyright: ignore[reportPrivateUsage]

from searx import settings
from searx.data import USER_AGENTS, gsa_useragents_loader
from searx.version import VERSION_TAG
from searx.exceptions import SearxXPathSyntaxException, SearxEngineXPathException
from searx import logger

logger = logger.getChild('utils')

XPathSpecType: t.TypeAlias = str | XPath
"""Type alias used by :py:obj:`searx.utils.get_xpath`,
:py:obj:`searx.utils.eval_xpath` and other XPath selectors."""

# Union of the lxml element classes accepted by the XPath/text helpers below.
ElementType: t.TypeAlias = ElementBase | _Element


# Tags whose text content must never end up in extracted text (see HTMLTextExtractor).
_BLOCKED_TAGS = ('script', 'style')

# ECMA ``unescape`` sequences: "%uXXXX" (4 hex digits) and "%XX" (2 hex digits).
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)

# Regexes used by the JS-object-literal to JSON converter (js_obj_str_to_json_str).
_JS_STRING_DELIMITERS = re.compile(r'(["\'`])')
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])([\$_\w][\$_\w0-9]*)(:)')
_JS_VOID_OR_UNDEFINED_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)|undefined')
_JS_DECIMAL_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]*)\.([0-9_]*)")
_JS_DECIMAL2_RE = re.compile(r"([\[\,:])\s*(\-?)\s*([0-9_]+)")
_JS_EXTRA_COMA_RE = re.compile(r"\s*,\s*([\]\}])")
_JS_STRING_ESCAPE_RE = re.compile(r'\\(.)')
# Escape characters that are already valid in JSON and can be passed through unchanged.
_JSON_PASSTHROUGH_ESCAPES = R'"\bfnrtu'

# Cache of compiled XPath objects keyed by the XPath expression (see get_xpath).
_XPATH_CACHE: dict[str, XPath] = {}
# Cache of per-language locale mappings; not populated in this chunk — presumably
# filled by a language helper elsewhere in searx (verify against callers).
_LANG_TO_LC_CACHE: dict[str, dict[str, str]] = {}


class _NotSetClass:  # pylint: disable=too-few-public-methods
    """Internal sentinel type for this module — do not create further instances.

    Its single instance ``_NOTSET`` stands in for *argument not given*, so that
    callers can still pass ``None`` explicitly as a meaningful value."""


# The module-wide sentinel instance (compare against it to detect "not set").
_NOTSET = _NotSetClass()


def searxng_useragent() -> str:
    """Build the User-Agent string identifying this SearXNG instance."""
    suffix = settings['outgoing']['useragent_suffix']
    return f"SearXNG/{VERSION_TAG} {suffix}".strip()
def gen_useragent(os_string: str | None = None) -> str:
    """Return a random browser User Agent string.

    See searx/data/useragents.json
    """
    os_part = os_string if os_string else choice(USER_AGENTS['os'])
    version_part = choice(USER_AGENTS['versions'])
    return USER_AGENTS['ua'].format(os=os_part, version=version_part)
def gen_gsa_useragent() -> str:
    """Return a random "Android Google App" User Agent suitable for Google.

    See searx/data/gsa_useragents.txt
    """
    base_agent = choice(gsa_useragents_loader())
    return f"{base_agent} GoogleApp/{randint(0, 9)}"
[docs] class HTMLTextExtractor(HTMLParser): """Internal class to extract text from HTML""" def __init__(self): HTMLParser.__init__(self) self.result: list[str] = [] self.tags: list[str] = [] def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: self.tags.append(tag) if tag == 'br': self.result.append(' ') def handle_endtag(self, tag: str) -> None: if not self.tags: return if tag != self.tags[-1]: self.result.append(f"</{tag}>") return self.tags.pop() def is_valid_tag(self): return not self.tags or self.tags[-1] not in _BLOCKED_TAGS def handle_data(self, data: str) -> None: if not self.is_valid_tag(): return self.result.append(data) def handle_charref(self, name: str) -> None: if not self.is_valid_tag(): return if name[0] in ('x', 'X'): codepoint = int(name[1:], 16) else: codepoint = int(name) self.result.append(chr(codepoint)) def handle_entityref(self, name: str) -> None: if not self.is_valid_tag(): return # codepoint = htmlentitydefs.name2codepoint[name] # self.result.append(chr(codepoint)) self.result.append(name) def get_text(self): return ''.join(self.result).strip() def error(self, message: str) -> None: # error handle is needed in <py3.10 # https://github.com/python/cpython/pull/8562/files raise AssertionError(message)
def html_to_text(html_str: str) -> str:
    """Extract text from a HTML string

    Args:
        * html_str (str): string HTML

    Returns:
        * str: extracted text

    Examples:
        >>> html_to_text('Example <span id="42">#2</span>')
        'Example #2'

        >>> html_to_text('<style>.span { color: red; }</style><span>Example</span>')
        'Example'

        >>> html_to_text(r'regexp: (?&lt;![a-zA-Z]')
        'regexp: (?<![a-zA-Z]'

        >>> html_to_text(r'&#x3e &#x3c &#97')
        '> < a'
    """
    if not html_str:
        return ""
    # collapse all whitespace (newlines included) to single spaces
    normalized = ' '.join(html_str.replace('\n', ' ').replace('\r', ' ').split())
    extractor = HTMLTextExtractor()
    try:
        extractor.feed(normalized)
        extractor.close()
    except AssertionError:
        # the parser gave up (error()): retry on an escaped copy
        extractor = HTMLTextExtractor()
        extractor.feed(escape(normalized, quote=True))
        extractor.close()
    return extractor.get_text()
def markdown_to_text(markdown_str: str) -> str:
    """Extract plain text from a Markdown string.

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """
    parser = MarkdownIt("commonmark", {"typographer": True})
    parser = parser.enable(["replacements", "smartquotes"])
    rendered: str = parser.render(markdown_str)
    return html_to_text(rendered)
def extract_text(
    xpath_results: list[ElementType] | ElementType | str | Number | bool | None,
    allow_none: bool = False,
) -> str | None:
    """Extract text from a lxml result

    - If ``xpath_results`` is a list of :py:obj:`ElementType` objects, extract
      the text from each result and concatenate the list in a string.
    - If ``xpath_results`` is a :py:obj:`ElementType` object, extract all the
      text node from it ( :py:obj:`lxml.html.tostring`, ``method="text"`` )
    - If ``xpath_results`` is of type :py:obj:`str` or :py:obj:`Number`,
      :py:obj:`bool` the string value is returned.
    - If ``xpath_results`` is of type ``None`` a :py:obj:`ValueError` is
      raised, except ``allow_none`` is ``True`` where ``None`` is returned.
    """
    if isinstance(xpath_results, list):
        # list of results: recurse on each entry and concatenate
        parts = [extract_text(item) or '' for item in xpath_results]
        return ''.join(parts).strip()
    if isinstance(xpath_results, ElementType):
        # a single element: serialize its text nodes only
        text: str = html.tostring(  # type: ignore
            xpath_results,  # pyright: ignore[reportArgumentType]
            encoding='unicode',
            method='text',
            with_tail=False,
        )
        # normalize whitespace to single spaces
        return ' '.join(text.strip().replace('\n', ' ').split())  # type: ignore
    if isinstance(xpath_results, (str, Number, bool)):
        return str(xpath_results)
    if xpath_results is None:
        if allow_none:
            return None
        raise ValueError('extract_text(None, allow_none=False)')
    raise ValueError('unsupported type')
def normalize_url(url: str, base_url: str) -> str:
    """Normalize URL: add protocol, join URL with base_url, add trailing slash
    if there is no path.

    Args:
        * url (str): Relative URL
        * base_url (str): Base URL, it must be an absolute URL.

    Example:
        >>> normalize_url('https://example.com', 'http://example.com/')
        'https://example.com/'
        >>> normalize_url('//example.com', 'http://example.com/')
        'http://example.com/'
        >>> normalize_url('/path?a=1', 'https://example.com')
        'https://example.com/path?a=1'
        >>> normalize_url('', 'https://example.com')
        'https://example.com/'

    Raises:
        * ValueError
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    if url.startswith('//'):
        # scheme-relative URL: inherit the scheme of base_url (http as fallback)
        base_scheme = urlparse(base_url).scheme or 'http'
        url = '{0}:{1}'.format(base_scheme, url)
    elif url.startswith('/'):
        # path-relative URL: join against the base
        url = urljoin(base_url, url)

    # relative URLs that fell through the checks above
    if '://' not in url:
        url = urljoin(base_url, url)

    parsed = urlparse(url)
    if not parsed.netloc:
        raise ValueError('Cannot parse url')
    if not parsed.path:
        # URLs without a path get a trailing slash
        url = url + '/'
    return url
def extract_url(xpath_results: list[ElementType] | ElementType | str | Number | bool | None, base_url: str) -> str:
    """Extract and normalize URL from lxml Element

    Example:
        >>> def f(s, search_url):
        >>>    return searx.utils.extract_url(html.fromstring(s), search_url)
        >>> f('<span id="42">https://example.com</span>', 'http://example.com/')
        'https://example.com/'
        >>> f('//example.com', 'https://example.com/')
        'https://example.com/'
        >>> searx.utils.extract_url([], 'https://example.com')
        raise ValueError

    Raises:
        * ValueError
        * lxml.etree.ParserError

    Returns:
        * str: normalized URL
    """
    if xpath_results == []:
        raise ValueError('Empty url resultset')
    candidate = extract_text(xpath_results)
    if not candidate:
        raise ValueError('URL not found')
    return normalize_url(candidate, base_url)
def dict_subset(dictionary: MutableMapping[t.Any, t.Any], properties: set[str]) -> MutableMapping[str, t.Any]:
    """Return a new dict with only those keys of ``properties`` that exist in
    ``dictionary``.

    Examples:
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'C'])
        {'A': 'a', 'C': 'c'}
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
        {'A': 'a'}
    """
    subset: dict[str, t.Any] = {}
    for key in properties:
        if key in dictionary:
            subset[key] = dictionary[key]
    return subset
[docs] def humanize_bytes(size: int | float, precision: int = 2): """Determine the *human readable* value of bytes on 1024 base (1KB=1024B).""" s = ['B ', 'KB', 'MB', 'GB', 'TB'] x = len(s) p = 0 while size > 1024 and p < x: p += 1 size = size / 1024.0 return "%.*f %s" % (precision, size, s[p])
[docs] def humanize_number(size: int | float, precision: int = 0): """Determine the *human readable* value of a decimal number.""" s = ['', 'K', 'M', 'B', 'T'] x = len(s) p = 0 while size > 1000 and p < x: p += 1 size = size / 1000.0 return "%.*f%s" % (precision, size, s[p])
def convert_str_to_int(number_str: str) -> int:
    """Convert ``number_str`` to an int; return 0 when it is not a plain
    (non-negative, decimal) number."""
    return int(number_str) if number_str.isdigit() else 0
def extr(txt: str, begin: str, end: str, default: str = "") -> str:
    """Extract the substring of ``txt`` found between ``begin`` and ``end``.

    :param txt: String to search in
    :param begin: First string to be searched for
    :param end: Second string to be searched for after ``begin``
    :param default: Default value if one of ``begin`` or ``end`` is not found.
                    Defaults to an empty string.
    :return: The string between the two search-strings ``begin`` and ``end``.
        If at least one of ``begin`` or ``end`` is not found, the value of
        ``default`` is returned.

    Examples:
        >>> extr("abcde", "a", "e")
        "bcd"
        >>> extr("abcde", "a", "z", default="nothing")
        "nothing"
    """
    # From https://github.com/mikf/gallery-dl/blob/master/gallery_dl/text.py#L129
    try:
        start = txt.index(begin) + len(begin)
        stop = txt.index(end, start)
    except ValueError:
        return default
    return txt[start:stop]
def int_or_zero(num: list[str] | str) -> int:
    """Convert ``num`` to int, or to 0 on failure.

    A list argument is reduced to its first element (empty list gives 0); a
    string is converted via :py:obj:`convert_str_to_int`.
    """
    if isinstance(num, list):
        if not num:
            return 0
        num = num[0]
    return convert_str_to_int(num)
def load_module(filename: str, module_dir: str) -> types.ModuleType:
    """Load and execute the Python file ``filename`` located in ``module_dir``.

    See https://docs.python.org/3/library/importlib.html#importing-a-source-file-directly

    :raises ValueError: when the module spec or its loader cannot be created.
    """
    modname = splitext(filename)[0]
    modpath = join(module_dir, filename)
    spec = importlib.util.spec_from_file_location(modname, modpath)
    if spec is None or spec.loader is None:
        raise ValueError(f"Error loading '{modpath}' module")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
def to_string(obj: t.Any) -> str:
    """Convert ``obj`` to its string representation."""
    if isinstance(obj, str):
        return obj
    # every object normally has __str__; repr() is a defensive fallback
    return str(obj) if hasattr(obj, '__str__') else repr(obj)
def ecma_unescape(string: str) -> str:
    """Python implementation of the unescape javascript function

    https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
    https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape

    Examples:
        >>> ecma_unescape('%u5409')
        '吉'
        >>> ecma_unescape('%20')
        ' '
        >>> ecma_unescape('%F3')
        'ó'
    """

    def _from_hex(match: re.Match[str]) -> str:
        return chr(int(match.group(1), 16))

    # "%u5409" becomes "吉"
    string = _ECMA_UNESCAPE4_RE.sub(_from_hex, string)
    # "%20" becomes " ", "%F3" becomes "ó"
    string = _ECMA_UNESCAPE2_RE.sub(_from_hex, string)
    return string
def remove_pua_from_str(string: str):
    """Removes unicode's "PRIVATE USE CHARACTER"s (PUA_) from a string.

    .. _PUA: https://en.wikipedia.org/wiki/Private_Use_Areas
    """
    pua_ranges = ((0xE000, 0xF8FF), (0xF0000, 0xFFFFD), (0x100000, 0x10FFFD))

    def _keep(char: str) -> bool:
        cp = ord(char)
        return not any(lo <= cp <= hi for lo, hi in pua_ranges)

    return "".join(filter(_keep, string))
def get_string_replaces_function(replaces: dict[str, str]) -> Callable[[str], str]:
    """Return a function that replaces every key of ``replaces`` by its value
    in the string it is given (single regex pass)."""
    escaped = {re.escape(old): new for old, new in replaces.items()}
    pattern = re.compile("|".join(escaped))

    def replace_all(text: str) -> str:
        return pattern.sub(lambda match: escaped[re.escape(match.group(0))], text)

    return replace_all
def get_engine_from_settings(name: str) -> dict[str, dict[str, str]]:
    """Return engine configuration from settings.yml of a given engine name"""
    if 'engines' not in settings:
        return {}
    for engine in settings['engines']:
        # skip malformed entries without a name
        if engine.get('name') == name:
            return engine
    return {}
def get_xpath(xpath_spec: XPathSpecType) -> XPath:
    """Return cached compiled :py:obj:`lxml.etree.XPath` object.

    ``TypeError``: Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
    :py:obj:`lxml.etree.XPath`.

    ``SearxXPathSyntaxException``: Raised when there is a syntax error in the
    *XPath* selector (``str``).
    """
    if isinstance(xpath_spec, XPath):
        # already compiled: nothing to do
        return xpath_spec
    if isinstance(xpath_spec, str):
        compiled = _XPATH_CACHE.get(xpath_spec)
        if compiled is None:
            try:
                compiled = XPath(xpath_spec)
            except XPathSyntaxError as e:
                raise SearxXPathSyntaxException(xpath_spec, str(e.msg)) from e
            _XPATH_CACHE[xpath_spec] = compiled
        return compiled
    raise TypeError('xpath_spec must be either a str or a lxml.etree.XPath')  # pyright: ignore[reportUnreachable]
def eval_xpath(element: ElementType, xpath_spec: XPathSpecType) -> t.Any:
    """Equivalent of ``element.xpath(xpath_str)`` but compile ``xpath_str`` into
    a :py:obj:`lxml.etree.XPath` object once for all.  The return value of
    ``xpath(..)`` is complex, read `XPath return values`_ for more details.

    .. _XPath return values: https://lxml.de/xpathxslt.html#xpath-return-values

    ``TypeError``: Raised when ``xpath_spec`` is neither a :py:obj:`str` nor a
    :py:obj:`lxml.etree.XPath`.

    ``SearxXPathSyntaxException``: Raised when there is a syntax error in the
    *XPath* selector (``str``).

    ``SearxEngineXPathException``: Raised when the XPath can't be evaluated
    (masked :py:obj:`lxml.etree.XPathError`).
    """
    compiled: XPath = get_xpath(xpath_spec)
    try:
        # https://lxml.de/xpathxslt.html#xpath-return-values
        return compiled(element)
    except XPathError as e:
        arg = ' '.join(str(i) for i in e.args)
        raise SearxEngineXPathException(xpath_spec, arg) from e
def eval_xpath_list(element: ElementType, xpath_spec: XPathSpecType, min_len: int | None = None) -> list[t.Any]:
    """Same as :py:obj:`searx.utils.eval_xpath`, but additionally ensures the
    return value is a :py:obj:`list` (of at least ``min_len`` items when set)."""
    result: list[t.Any] = eval_xpath(element, xpath_spec)
    if not isinstance(result, list):
        raise SearxEngineXPathException(xpath_spec, 'the result is not a list')
    if min_len is not None and len(result) < min_len:
        raise SearxEngineXPathException(xpath_spec, 'len(xpath_str) < ' + str(min_len))
    return result
def eval_xpath_getindex(
    element: ElementType,
    xpath_spec: XPathSpecType,
    index: int,
    default: t.Any = _NOTSET,
) -> t.Any:
    """Same as :py:obj:`searx.utils.eval_xpath_list`, but returns the item on
    position ``index`` from the list (index starts with ``0``).  The exceptions
    known from :py:obj:`searx.utils.eval_xpath` are thrown.

    :param default: returned when no element exists at ``index``; when omitted
        a :py:obj:`SearxEngineXPathException` is raised instead.
    """
    result = eval_xpath_list(element, xpath_spec)
    if -len(result) <= index < len(result):
        return result[index]
    # identity check against the sentinel: ``==`` could be hijacked by a
    # default value whose __eq__ claims equality with _NOTSET
    if default is _NOTSET:
        # raise an SearxEngineXPathException instead of IndexError to record
        # xpath_spec
        raise SearxEngineXPathException(xpath_spec, 'index ' + str(index) + ' not found')
    return default
def get_embeded_stream_url(url: str):
    """Convert a standard video URL into its embed format.

    Supported services: YouTube, Facebook, Instagram, TikTok, Dailymotion and
    Bilibili.  Returns ``None`` when ``url`` matches none of them.
    """
    parsed_url = urlparse(url)
    netloc = parsed_url.netloc
    path = parsed_url.path

    # YouTube
    if netloc in ['www.youtube.com', 'youtube.com'] and path == '/watch' and parsed_url.query:
        video_id = parse_qs(parsed_url.query).get('v', [])
        if video_id:
            return 'https://www.youtube-nocookie.com/embed/' + video_id[0]
        return None

    # Facebook
    if netloc in ['www.facebook.com', 'facebook.com']:
        encoded_href = urlencode({'href': url})
        return 'https://www.facebook.com/plugins/video.php?allowfullscreen=true&' + encoded_href

    # Instagram
    if netloc in ['www.instagram.com', 'instagram.com'] and path.startswith('/p/'):
        suffix = 'embed' if path.endswith('/') else '/embed'
        return url + suffix

    # TikTok
    if netloc in ['www.tiktok.com', 'tiktok.com'] and path.startswith('/@') and '/video/' in path:
        video_id = path.split('/video/')[1]
        return 'https://www.tiktok.com/embed/' + video_id

    # Dailymotion
    if netloc in ['www.dailymotion.com', 'dailymotion.com'] and path.startswith('/video/'):
        path_parts = path.split('/')
        if len(path_parts) == 3:
            return 'https://www.dailymotion.com/embed/video/' + path_parts[2]
        return None

    # Bilibili
    if netloc in ['www.bilibili.com', 'bilibili.com'] and path.startswith('/video/'):
        video_id = path.split('/')[2]
        # "avNNN" ids use the "aid" parameter, "BV..." ids use "bvid"
        param_key = None
        if video_id.startswith('av'):
            video_id = video_id[2:]
            param_key = 'aid'
        elif video_id.startswith('BV'):
            param_key = 'bvid'
        return (
            f'https://player.bilibili.com/player.html?{param_key}={video_id}&high_quality=1&autoplay=false&danmaku=0'
        )

    return None
def _j2p_process_escape(match: re.Match[str]) -> str: # deal with ECMA escape characters _escape = match.group(1) or match.group(2) return ( Rf'\{_escape}' if _escape in _JSON_PASSTHROUGH_ESCAPES else R'\u00' if _escape == 'x' else '' if _escape == '\n' else _escape ) def _j2p_decimal(match: re.Match[str]) -> str: return ( match.group(1) + match.group(2) + (match.group(3).replace("_", "") or "0") + "." + (match.group(4).replace("_", "") or "0") ) def _j2p_decimal2(match: re.Match[str]) -> str: return match.group(1) + match.group(2) + match.group(3).replace("_", "")
def js_obj_str_to_python(js_obj_str: str) -> t.Any:
    """Convert a javascript variable into JSON and then load the value

    It does not deal with all cases, but it is good enough for now.
    chompjs has a better implementation.
    """
    json_str = js_obj_str_to_json_str(js_obj_str)
    if json_str == "":
        raise ValueError("js_obj_str can't be an empty string")
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        logger.debug("Internal error: js_obj_str_to_python creates invalid JSON:\n%s", json_str)
        raise ValueError("js_obj_str_to_python creates invalid JSON") from e
def js_obj_str_to_json_str(js_obj_str: str) -> str:
    """Convert a JavaScript object literal (as string) into a JSON string.

    Not a full parser: strings are re-delimited with double quotes, object keys
    are quoted, ``undefined`` / ``void N`` become ``null``, number literals are
    normalized and trailing commas are dropped.
    """
    if not isinstance(js_obj_str, str):
        raise ValueError("js_obj_str must be of type str")
    if js_obj_str == "":
        raise ValueError("js_obj_str can't be an empty string")

    # when in_string is not None, it contains the character that has opened the
    # current string: simple quote, double quote or backtick
    in_string = None

    # cut the string:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    parts = _JS_STRING_DELIMITERS.split(js_obj_str)

    # does the previous part end with a backslash?
    blackslash_just_before = False

    for i, p in enumerate(parts):
        if p == in_string and not blackslash_just_before:
            # * the current part matches the character which has opened the string
            # * there is no backslash just before
            # --> the current part closes the current string
            in_string = None
            # replace simple quote and ` by double quote
            # since JSON supports only double quote for strings
            parts[i] = '"'
        elif in_string:
            # --> we are inside a JS string
            # replace the colon by a temporary character
            # so _JS_QUOTE_KEYS_RE doesn't have to deal with colons inside JS strings
            p = p.replace(':', chr(1))
            # replace JS escape sequences by JSON escape sequences
            p = _JS_STRING_ESCAPE_RE.sub(_j2p_process_escape, p)
            # a JS string delimited by simple quotes is not supported by JSON:
            # simple quote delimited strings are converted to double quote
            # delimited strings — so here, inside such a JS string, we escape
            # the double quote
            if in_string == "'":
                p = p.replace('"', r'\"')
            parts[i] = p
            # deal with the sequence backslash-then-quote:
            # since js_obj_str is split on quotes, we detect this case when
            # * the previous part ends with a backslash
            # * the current part starts with a single quote
            # when detected, the backslash is removed from the previous part
            if blackslash_just_before and p[:1] == "'":
                parts[i - 1] = parts[i - 1][:-1]
        elif in_string is None and p in ('"', "'", "`"):
            # we are not in a string but p is a string delimiter
            # --> that's the start of a new string
            in_string = p
            # replace simple quote by double quote
            # since JSON supports only double quote for strings
            parts[i] = '"'
        elif in_string is None:
            # we are not in a string
            # replace by null these values:
            # * void 0
            # * void(0)
            # * undefined
            # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
            p = _JS_VOID_OR_UNDEFINED_RE.sub("null", p)
            # make sure there is a leading zero in front of floats
            p = _JS_DECIMAL_RE.sub(_j2p_decimal, p)
            p = _JS_DECIMAL2_RE.sub(_j2p_decimal2, p)
            # remove extra comma in a list or an object
            # for example [1,2,3,] becomes [1,2,3]
            p = _JS_EXTRA_COMA_RE.sub(lambda match: match.group(1), p)
            parts[i] = p

        # update for the next iteration
        blackslash_just_before = len(p) > 0 and p[-1] == '\\'

    # join the string
    s = ''.join(parts)
    # add quotes around the keys:
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # replace the surrogate character by colon and strip whitespace
    s = s.replace(chr(1), ':').strip()
    return s
[docs] def parse_duration_string(duration_str: str) -> timedelta | None: """Parse a time string in format MM:SS or HH:MM:SS and convert it to a `timedelta` object. Returns None if the provided string doesn't match any of the formats. """ duration_str = duration_str.strip() if not duration_str: return None try: # prepending ["00"] here inits hours to 0 if they are not provided time_parts = (["00"] + duration_str.split(":"))[:3] hours, minutes, seconds = map(int, time_parts) return timedelta(hours=hours, minutes=minutes, seconds=seconds) except (ValueError, TypeError): pass return None