From 7b9793e2248900574b5cd3c46eb15fc426016088 Mon Sep 17 00:00:00 2001 From: n4ze3m Date: Fri, 23 Aug 2024 00:54:57 +0530 Subject: [PATCH] feat: add internet search integration --- docker/docker-compose.yml | 10 +- docker/searxng/limiter.toml | 1 + docker/searxng/settings.yml | 78 +++++++++ server/package.json | 1 + server/prisma/migrations/q_14_6/migration.sql | 2 + server/prisma/schema.prisma | 1 + server/src/internet/index.ts | 159 ++++++++++++++++++ server/src/utils/store.ts | 48 ++++-- server/yarn.lock | 2 +- 9 files changed, 290 insertions(+), 12 deletions(-) create mode 100644 docker/searxng/limiter.toml create mode 100644 docker/searxng/settings.yml create mode 100644 server/prisma/migrations/q_14_6/migration.sql create mode 100644 server/src/internet/index.ts diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 5189d600..17f50d01 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -10,11 +10,13 @@ services: environment: DATABASE_URL: postgres://postgres:postgres@dialoqbase-pg:5432/dialoqbase?connection_limit=15&pool_timeout=0 DB_REDIS_URL: redis://redis:6379 + DB_SEARXNG_URL: http://searxng:8080 env_file: - .env depends_on: - dialoqbase-pg - redis + - searxng volumes: - .uploads:/app/uploads @@ -34,4 +36,10 @@ services: container_name: redis restart: unless-stopped volumes: - - .redis:/data \ No newline at end of file + - .redis:/data + + searxng: + image: searxng/searxng + volumes: + - ./searxng:/etc/searxng:rw + restart: unless-stopped \ No newline at end of file diff --git a/docker/searxng/limiter.toml b/docker/searxng/limiter.toml new file mode 100644 index 00000000..bf1e9a85 --- /dev/null +++ b/docker/searxng/limiter.toml @@ -0,0 +1 @@ +#https://docs.searxng.org/admin/searx.limiter.html \ No newline at end of file diff --git a/docker/searxng/settings.yml b/docker/searxng/settings.yml new file mode 100644 index 00000000..81d6f92f --- /dev/null +++ b/docker/searxng/settings.yml @@ -0,0 +1,78 @@ 
+use_default_settings: true + +search: + # Filter results. 0: None, 1: Moderate, 2: Strict + safe_search: 0 + # Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "yandex", "mwmbl", + # "seznam", "startpage", "stract", "swisscows", "qwant", "wikipedia" - leave blank to turn it off + # by default. + autocomplete: 'google' + # minimum characters to type before autocompleter starts + autocomplete_min: 4 + # Default search language - leave blank to detect from browser information or + # use codes from 'languages.py' + default_lang: 'auto' + # max_page: 0 # if engine supports paging, 0 means unlimited numbers of pages + # Available languages + # languages: + # - all + # - en + # - en-US + # - de + # - it-IT + # - fr + # - fr-BE + # ban time in seconds after engine errors + ban_time_on_fail: 5 + # max ban time in seconds after engine errors + max_ban_time_on_fail: 120 + suspended_times: + # Engine suspension time after error (in seconds; set to 0 to disable) + # For error "Access denied" and "HTTP error [402, 403]" + SearxEngineAccessDenied: 86400 + # For error "CAPTCHA" + SearxEngineCaptcha: 86400 + # For error "Too many request" and "HTTP error 429" + SearxEngineTooManyRequests: 3600 + # Cloudflare CAPTCHA + cf_SearxEngineCaptcha: 1296000 + cf_SearxEngineAccessDenied: 86400 + # ReCAPTCHA + recaptcha_SearxEngineCaptcha: 604800 + + # remove format to deny access, use lower case. + # formats: [html, csv, json, rss] + formats: + - html + - json + +server: + # Is overwritten by ${SEARXNG_PORT} and ${SEARXNG_BIND_ADDRESS} + port: 8888 + bind_address: '0.0.0.0' + # public URL of the instance, to ensure correct inbound links. Is overwritten + # by ${SEARXNG_URL}. + base_url: false # "http://example.com/location" + # rate limit the number of requests on the instance, block some bots. + # Is overwritten by ${SEARXNG_LIMITER} + limiter: false + # enable features designed only for public instances.
+ # Is overwritten by ${SEARXNG_PUBLIC_INSTANCE} + public_instance: false + + # If your instance owns a /etc/searxng/settings.yml file, then set the following + # values there. + + secret_key: 'KDzXs0qvZdoZnzW7Eq4jhubjgTWayRM' # Is overwritten by ${SEARXNG_SECRET} + # Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY} + image_proxy: false + # 1.0 and 1.1 are supported + http_protocol_version: '1.0' + # POST queries are more secure as they don't show up in history but may cause + # problems when using Firefox containers + method: 'POST' + default_http_headers: + X-Content-Type-Options: nosniff + X-Download-Options: noopen + X-Robots-Tag: noindex, nofollow + Referrer-Policy: no-referrer \ No newline at end of file diff --git a/server/package.json b/server/package.json index 38494f84..b6d435af 100644 --- a/server/package.json +++ b/server/package.json @@ -79,6 +79,7 @@ "jsonwebtoken": "^9.0.2", "langchain": "^0.1.25", "mammoth": "^1.6.0", + "ml-distance": "^4.0.1", "pdf-parse": "^1.1.1", "pdfjs-dist": "^3.7.107", "pubsub-js": "^1.9.4", diff --git a/server/prisma/migrations/q_14_6/migration.sql b/server/prisma/migrations/q_14_6/migration.sql new file mode 100644 index 00000000..fe7e584c --- /dev/null +++ b/server/prisma/migrations/q_14_6/migration.sql @@ -0,0 +1,2 @@ +-- AlterTable +ALTER TABLE "User" ADD COLUMN "isSuspended" BOOLEAN NOT NULL DEFAULT false; diff --git a/server/prisma/schema.prisma b/server/prisma/schema.prisma index 87d495a0..c2212a29 100644 --- a/server/prisma/schema.prisma +++ b/server/prisma/schema.prisma @@ -87,6 +87,7 @@ model User { isFirstLogin Boolean @default(true) isAdministrator Boolean @default(false) createdAt DateTime @default(now()) + isSuspended Boolean @default(false) bots Bot[] apiKeys UserApiKey[] } diff --git a/server/src/internet/index.ts b/server/src/internet/index.ts new file mode 100644 index 00000000..f9c675e2 --- /dev/null +++ b/server/src/internet/index.ts @@ -0,0 +1,159 @@ +import * as cheerio from 
"cheerio"; +import { Embeddings } from "@langchain/core/embeddings"; +import { Document } from "@langchain/core/documents"; +import * as ml_distance from "ml-distance" + +const SERACH_PROVIDER = process.env.DB_SEARCH_PROVIDER || "default"; +const TOTAL_RESULTS_LIMIT = process.env.DB_TOTAL_RESULTS_LIMIT ? parseInt(process.env.DB_TOTAL_RESULTS_LIMIT) : 5; + +export const duckduckgoSearchUnOffical = async (query: string) => { + const abortController = new AbortController(); + setTimeout(() => abortController.abort(), 10000); + + const htmlString = await fetch( + "https://html.duckduckgo.com/html/?q=" + query, + { + signal: abortController.signal, + } + ) + .then((response) => response.text()) + .catch(); + + const $ = cheerio.load(htmlString); + + const searchResults = Array.from($("div.results_links_deep")).map( + (result) => { + const title = $(result).find("a.result__a").text(); + const link = $(result) + .find("a.result__snippet") + .attr("href") + .replace("//duckduckgo.com/l/?uddg=", "") + .replace(/&rut=.*/, ""); + + const content = $(result).find("a.result__snippet").text(); + const decodedLink = decodeURIComponent(link); + return { title, link: decodedLink, content }; + } + ); + + return searchResults; +}; + +export const googleSearchUnOffical = async (query: string) => { + const abortController = new AbortController(); + setTimeout(() => abortController.abort(), 10000); + + const htmlString = await fetch( + "https://www.google.com/search?hl=en&q=" + query, + { + signal: abortController.signal, + } + ) + .then((response) => response.text()) + .catch(); + + const $ = cheerio.load(htmlString); + + const searchResults = $("div.g").map((_, result) => { + const title = $(result).find("h3").text(); + const link = $(result).find("a").attr("href"); + const content = $(result).find("span").map((_, span) => $(span).text()).get().join(" "); + return { title, link, content }; + }).get(); + + return searchResults; +}; + +export const searxngSearch = async (query: string) 
=> { + const abortController = new AbortController(); + setTimeout(() => abortController.abort(), 10000); + + const searxngUrl = process.env.DB_SEARXNG_URL; + + if (!searxngUrl) { + throw new Error("SEARXNG_URL is not set"); + } + const url = new URL(`${searxngUrl}/search`); + + url.searchParams.append("q", query); + url.searchParams.append("format", "json"); + const response = await fetch(url.toString(), { + method: "GET", + headers: { + Accept: "application/json", + }, + }); + + if (!response.ok) { + const err = await response.json(); + console.error(`Error: ${err}`); + throw new Error(`Error: ${response.status}`); + } + + const data = (await response.json()) as { + results: { + title: string; + url: string; + content: string; + }[]; + }; + + return data.results.map((result) => ({ + title: result.title, + link: result.url, + content: result.content, + })); +}; + +const searchProviders = { + duckduckgo: duckduckgoSearchUnOffical, + google: googleSearchUnOffical, + searxng: searxngSearch, + default: + process.env.IS_RAILWAY != "true" + ? 
searxngSearch + : duckduckgoSearchUnOffical, +}; + +export const searchInternet = async (embedding: Embeddings, { query }: { query: string }) => { + const searchProvider = searchProviders[SERACH_PROVIDER]; + if (!searchProvider) { + throw new Error(`Search provider ${SERACH_PROVIDER} not found`); + } + const datat = await searchProvider(query); + + const results = datat.slice(0, TOTAL_RESULTS_LIMIT); + + const [docEmbeddings, queryEmbedding] = await Promise.all([ + embedding.embedDocuments(results.map((doc) => doc.content)), + embedding.embedQuery(query), + ]); + + + const similarity = docEmbeddings.map((docEmbedding, i) => { + const sim = ml_distance.similarity.cosine(queryEmbedding, docEmbedding) + + return { + index: i, + similarity: sim + } + }) + + const sortedDocs = similarity + .sort((a, b) => b.similarity - a.similarity) + .filter((sim) => sim.similarity > 0.5) + .slice(0, 15) + .map((sim) => { + return [ + { + pageContent: results[sim.index]?.content || "", + metadata: { + source: results[sim.index]?.link || "", + } + } as Document, + sim.similarity + ] + }) + + return sortedDocs; +}; diff --git a/server/src/utils/store.ts b/server/src/utils/store.ts index 33bce9c3..84766a89 100644 --- a/server/src/utils/store.ts +++ b/server/src/utils/store.ts @@ -2,6 +2,8 @@ import { Document } from "@langchain/core/documents"; import { PrismaClient } from "@prisma/client"; import { Embeddings } from "@langchain/core/embeddings"; import { VectorStore } from "@langchain/core/vectorstores"; +import { Callbacks } from "langchain/callbacks"; +import { searchInternet } from "../internet"; const prisma = new PrismaClient(); export interface DialoqbaseLibArgs { botId: string; @@ -92,7 +94,8 @@ export class DialoqbaseVectorStore extends VectorStore { async similaritySearchVectorWithScore( query: number[], k: number, - filter?: this["FilterType"] | undefined + filter?: this["FilterType"] | undefined, + originalQuery?: string | undefined ): Promise<[Document>, number][]> { if 
(!query) { return []; @@ -114,10 +117,8 @@ export class DialoqbaseVectorStore extends VectorStore { const data = await prisma.$queryRaw` SELECT * FROM "similarity_search_v2"(query_embedding := ${vector}::vector, botId := ${bot_id}::text,match_count := ${match_count}::int) `; - - const result: [Document, number][] = ( - data as SearchEmbeddingsResponse[] - ).map((resp) => [ + + const result = (data as SearchEmbeddingsResponse[]).map((resp) => [ new Document({ metadata: resp.metadata, pageContent: resp.content, @@ -125,15 +126,42 @@ export class DialoqbaseVectorStore extends VectorStore { resp.similarity, ]); + let internetSearchResults = []; + if (botInfo.internetSearchEnabled) { + internetSearchResults = await searchInternet(this.embeddings, { + query: originalQuery, + }); + } + + const combinedResults = [...result, ...internetSearchResults]; + combinedResults.sort((a, b) => b[1] - a[1]); + + const topResults = combinedResults.slice(0, k); + if (semanticSearchSimilarityScore === "none") { - return result; + return topResults; } - const valueInFloat = parseFloat(semanticSearchSimilarityScore); - const filteredResult = result.filter( - ([, similarity]) => similarity >= valueInFloat + const similarityThreshold = parseFloat(semanticSearchSimilarityScore); + const filteredResults = topResults.filter( + ([, similarity]) => similarity >= similarityThreshold + ); + return filteredResults; + } + + async similaritySearch( + query: string, + k = 4, + filter: this["FilterType"] | undefined = undefined, + _callbacks: Callbacks | undefined = undefined // implement passing to embedQuery later + ): Promise { + const results = await this.similaritySearchVectorWithScore( + await this.embeddings.embedQuery(query), + k, + filter, + query ); - return filteredResult; + return results.map((result) => result[0]); } _vectorstoreType(): string { diff --git a/server/yarn.lock b/server/yarn.lock index 26437ebc..a86a3b33 100644 --- a/server/yarn.lock +++ b/server/yarn.lock @@ -5552,7 +5552,7 
@@ ml-distance-euclidean@^2.0.0: resolved "https://registry.yarnpkg.com/ml-distance-euclidean/-/ml-distance-euclidean-2.0.0.tgz#3a668d236649d1b8fec96380b9435c6f42c9a817" integrity sha512-yC9/2o8QF0A3m/0IXqCTXCzz2pNEzvmcE/9HFKOZGnTjatvBbsn4lWYJkxENkA4Ug2fnYl7PXQxnPi21sgMy/Q== -ml-distance@^4.0.0: +ml-distance@^4.0.0, ml-distance@^4.0.1: version "4.0.1" resolved "https://registry.yarnpkg.com/ml-distance/-/ml-distance-4.0.1.tgz#4741d17a1735888c5388823762271dfe604bd019" integrity sha512-feZ5ziXs01zhyFUUUeZV5hwc0f5JW0Sh0ckU1koZe/wdVkJdGxcP06KNQuF0WBTj8FttQUzcvQcpcrOp/XrlEw==