# SPDX-License-Identifier: AGPL-3.0-or-later
"""This is the implementation of the Bing-Web engine. Some of this
implementation is shared by other engines:
- :ref:`bing images engine`
- :ref:`bing news engine`
- :ref:`bing videos engine`
.. note::
Some functionality (paging and time-range results) is not supported
since it depends on JavaScript.
"""
import base64
import re
import typing as t
from urllib.parse import parse_qs, urlencode, urlparse
import babel
import babel.languages
from lxml import html
from searx.enginelib.traits import EngineTraits
from searx.locales import region_tag
from searx.utils import eval_xpath, eval_xpath_getindex, eval_xpath_list, extract_text
if t.TYPE_CHECKING:
from searx.extended_types import SXNG_Response
from searx.search.processors import OnlineParams
# Metadata about the Bing-Web engine: the official API is not used, no API
# key is required and the results are parsed from HTML.
about: dict[str, t.Any] = dict(
    website="https://www.bing.com",
    wikidata_id="Q182496",
    official_api_documentation="https://github.com/MicrosoftDocs/bing-docs",
    use_official_api=False,
    require_api_key=False,
    results="HTML",
)
# engine dependent config
categories = ["general", "web"]
safesearch = True

# the position in the tuple is the safesearch level: 0, 1, 2
_safesearch_map: dict[int, str] = dict(enumerate(("off", "moderate", "strict")))
"""Safesearch level (0: None, 1: Moderate, 2: Strict) mapped to the filter
value sent to Bing."""

base_url = "https://www.bing.com/search"
"""URL of the Bing-Web search"""
[docs]
def get_locale_params(engine_region: str | None) -> dict[str, str] | None:
"""API documentation states the ``mkt`` parameter is *the
recommended primary signal* for locale:
If known, you are encouraged to always specify the market.
Specifying the market helps Bing route the request and return an
appropriate and optimal response.
The ``mkt`` parameter takes a full ``<language>-<country>`` code.
This function is shared with :py:mod:`searx.engines.bing_images`,
:py:mod:`searx.engines.bing_news`, and :py:mod:`searx.engines.bing_videos`.
"""
if not engine_region or engine_region == "clear":
return None
return {"mkt": engine_region}
[docs]
def override_accept_language(params: "OnlineParams", engine_region: str | None) -> None:
"""Override the ``Accept-Language`` header.
The default header built by :py:class:`~searx.search.processors.online.OnlineProcessor`
appends ``en;q=0.3`` as a fallback language::
Accept-Language: de,de-DE;q=0.7,en;q=0.3
Bing seems to better select the results locale based on the
``Accept-Language`` value header.
This function is shared with :py:mod:`searx.engines.bing_images`,
:py:mod:`searx.engines.bing_news`, and :py:mod:`searx.engines.bing_videos`.
"""
if not engine_region or engine_region == "clear":
return
lang = engine_region.split("-")[0]
params["headers"]["Accept-Language"] = f"{engine_region},{lang};q=0.9"
[docs]
def request(query: str, params: "OnlineParams") -> "OnlineParams":
    """Assemble a Bing-Web request.

    Sets the ``Accept-Language`` header, ``params["url"]`` and
    ``params["allow_redirects"]`` and returns the same ``params`` object.
    """
    engine_region = traits.get_region(params["searxng_locale"], traits.all_locale)
    override_accept_language(params, engine_region)

    # unknown safesearch levels fall back to "off"
    adult_filter = _safesearch_map.get(params.get("safesearch", 0), "off")
    query_params: dict[str, str | int] = {"q": query, "adlt": adult_filter}
    # the market is only added when get_locale_params() returns one
    query_params.update(get_locale_params(engine_region) or {})

    params["url"] = f"{base_url}?{urlencode(query_params)}"

    # in some regions where geoblocking is employed (e.g. China),
    # www.bing.com redirects to the regional version of Bing
    params["allow_redirects"] = True
    return params
[docs]
def response(resp: "SXNG_Response") -> list[dict[str, t.Any]]:
    """Get response from Bing-Web.

    Every ``<li class="b_algo">`` item of the result list that has a linked
    title yields a dict with the keys ``url``, ``title`` and ``content``.
    Bing's click-tracking links are replaced by their target URL.  If at least
    one result was found and the page shows a result counter, an additional
    ``{"number_of_results": <int>}`` item is appended.
    """
    results: list[dict[str, t.Any]] = []
    dom = html.fromstring(resp.text)

    for item in eval_xpath_list(dom, '//ol[@id="b_results"]/li[contains(@class, "b_algo")]'):
        link = eval_xpath_getindex(item, ".//h2/a", 0, None)
        if link is None:
            continue

        href = link.attrib.get("href", "")
        title = extract_text(link)
        if not href or not title:
            continue
        href = _resolve_tracking_url(href)

        # remove decorative icons that Bing injects into <p> elements
        # (`<span class="algoSlug_icon">`)
        content_els = eval_xpath(item, ".//p")
        for p in content_els:
            for icon in p.xpath('.//span[@class="algoSlug_icon"]'):
                # drop_tree() keeps the text that follows the icon, while
                # getparent().remove() would discard that tail text as well
                icon.drop_tree()
        content = extract_text(content_els)

        results.append({"url": href, "title": title, "content": content})

    if results:
        # keep only the digits of the result counter text
        result_len_container = "".join(eval_xpath(dom, '//span[@class="sb_count"]//text()'))
        result_len_container = re.sub(r"[^0-9]", "", result_len_container)
        if result_len_container:
            results.append({"number_of_results": int(result_len_container)})

    return results


def _resolve_tracking_url(href: str) -> str:
    """Return the target URL of a Bing click-tracking link.

    Tracking links have the form ``https://<bing host>/ck/a?...&u=a1<payload>``
    where ``<payload>`` is the base64url encoded (unpadded) target URL.
    ``href`` is returned unchanged if it is not such a link or if the payload
    can't be decoded.
    """
    try:
        parsed = urlparse(href)
        host = parsed.hostname or ""
    except ValueError:
        # not a parsable URL, leave it to the caller as is
        return href

    if parsed.scheme not in ("http", "https") or parsed.path != "/ck/a":
        return href
    # request() allows redirects to regional versions of Bing, so accept any
    # bing.com host (e.g. cn.bing.com) and not only www.bing.com
    if host != "bing.com" and not host.endswith(".bing.com"):
        return href

    u_values = parse_qs(parsed.query).get("u")
    if not u_values or not u_values[0].startswith("a1"):
        return href

    encoded = u_values[0][2:]
    # base64url without padding
    encoded += "=" * (-len(encoded) % 4)
    try:
        decoded = base64.urlsafe_b64decode(encoded).decode("utf-8", errors="replace")
    except ValueError:
        # binascii.Error (malformed payload) is a ValueError: keep the
        # tracking link rather than failing the whole result page
        return href
    return decoded or href
[docs]
def fetch_traits(engine_traits: EngineTraits) -> None:
    """Fetch regions from Bing-Web.

    The country codes (``cc``) are read from the region links of Bing's
    settings page.  Each country code is combined with the official languages
    babel knows for that country, and the resulting market codes are stored in
    ``engine_traits.regions``.  The special country code ``clear`` is stored
    in ``engine_traits.all_locale``.

    :raises RuntimeError: if the response from Bing is not OK.
    """
    # pylint: disable=import-outside-toplevel
    from searx.network import get  # see https://github.com/searxng/searxng/issues/762
    from searx.utils import gen_useragent

    resp = get(
        "https://www.bing.com/account/general",
        headers={
            "User-Agent": gen_useragent(),
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US;q=0.5,en;q=0.3",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-GPC": "1",
            "Cache-Control": "max-age=0",
        },
        timeout=5,
    )
    if not resp.ok:
        raise RuntimeError("Response from Bing is not OK.")

    region_links = eval_xpath(
        html.fromstring(resp.text),
        '//div[@id="region-section-content"]//div[@class="regionItem"]/a/@href',
    )

    # market codes that deviate from the <language>-<country> scheme;
    # for Hong Kong the market code is en-hk (the reason is not known)
    map_market_codes: dict[str, str] = {"zh-hk": "en-hk"}

    for href in region_links:
        cc_tag = parse_qs(urlparse(href).query)["cc"][0]
        if cc_tag == "clear":
            engine_traits.all_locale = cc_tag
            continue

        # add market codes from official languages of the country ..
        for official_lang in babel.languages.get_official_languages(cc_tag, de_facto=True):
            lang_tag = official_lang.split("_")[0]  # zh_Hant --> zh
            market_code = f"{lang_tag}-{cc_tag}"  # zh-tw
            market_code = map_market_codes.get(market_code, market_code)

            try:
                sxng_tag = region_tag(babel.Locale.parse("%s_%s" % (lang_tag, cc_tag.upper())))
            except babel.UnknownLocaleError:
                # silently ignore unknown languages
                continue

            known = engine_traits.regions.get(sxng_tag)
            if not known:
                engine_traits.regions[sxng_tag] = market_code
            elif known != market_code:
                # the first market code registered for a region tag wins
                print("CONFLICT: babel %s --> %s, %s" % (sxng_tag, known, market_code))