Skip to content

Commit

Permalink
feat: add internet search integration
Browse files Browse the repository at this point in the history
  • Loading branch information
n4ze3m committed Aug 22, 2024
1 parent f7ff1c3 commit 7b9793e
Show file tree
Hide file tree
Showing 9 changed files with 290 additions and 12 deletions.
10 changes: 9 additions & 1 deletion docker/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,13 @@ services:
environment:
DATABASE_URL: postgres://postgres:postgres@dialoqbase-pg:5432/dialoqbase?connection_limit=15&pool_timeout=0
DB_REDIS_URL: redis://redis:6379
DB_SEARXNG_URL: http://searxng:8080
env_file:
- .env
depends_on:
- dialoqbase-pg
- redis
- searxng
volumes:
- .uploads:/app/uploads

Expand All @@ -34,4 +36,10 @@ services:
container_name: redis
restart: unless-stopped
volumes:
- .redis:/data
- .redis:/data

searxng:
image: searxng/searxng
volumes:
- ./searxng:/etc/searxng:rw
restart: unless-stopped
1 change: 1 addition & 0 deletions docker/searxng/limiter.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
#https://docs.searxng.org/admin/searx.limiter.html
78 changes: 78 additions & 0 deletions docker/searxng/settings.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
use_default_settings: true

search:
# Filter results. 0: None, 1: Moderate, 2: Strict
safe_search: 0
# Existing autocomplete backends: "dbpedia", "duckduckgo", "google", "yandex", "mwmbl",
# "seznam", "startpage", "stract", "swisscows", "qwant", "wikipedia" - leave blank to turn it off
# by default.
autocomplete: 'google'
# minimum characters to type before autocompleter starts
autocomplete_min: 4
# Default search language - leave blank to detect from browser information or
# use codes from 'languages.py'
default_lang: 'auto'
# max_page: 0 # if engine supports paging, 0 means unlimited numbers of pages
# Available languages
# languages:
# - all
# - en
# - en-US
# - de
# - it-IT
# - fr
# - fr-BE
# ban time in seconds after engine errors
ban_time_on_fail: 5
# max ban time in seconds after engine errors
max_ban_time_on_fail: 120
suspended_times:
# Engine suspension time after error (in seconds; set to 0 to disable)
# For error "Access denied" and "HTTP error [402, 403]"
SearxEngineAccessDenied: 86400
# For error "CAPTCHA"
SearxEngineCaptcha: 86400
# For error "Too many requests" and "HTTP error 429"
SearxEngineTooManyRequests: 3600
# Cloudflare CAPTCHA
cf_SearxEngineCaptcha: 1296000
cf_SearxEngineAccessDenied: 86400
# ReCAPTCHA
recaptcha_SearxEngineCaptcha: 604800

# remove format to deny access, use lower case.
# formats: [html, csv, json, rss]
formats:
- html
- json

server:
# Is overwritten by ${SEARXNG_PORT} and ${SEARXNG_BIND_ADDRESS}
port: 8888
bind_address: '0.0.0.0'
# public URL of the instance, to ensure correct inbound links. Is overwritten
# by ${SEARXNG_URL}.
base_url: false # "http://example.com/location"
# rate limit the number of request on the instance, block some bots.
# Is overwritten by ${SEARXNG_LIMITER}
limiter: false
# enable features designed only for public instances.
# Is overwritten by ${SEARXNG_PUBLIC_INSTANCE}
public_instance: false

# If your instance owns a /etc/searxng/settings.yml file, then set the following
# values there.

secret_key: 'KDzXs0qvZdoZnzW7Eq4jhubjgTWayRM' # Is overwritten by ${SEARXNG_SECRET}
# Proxy image results through SearXNG. Is overwritten by ${SEARXNG_IMAGE_PROXY}
image_proxy: false
# 1.0 and 1.1 are supported
http_protocol_version: '1.0'
# POST queries are more secure as they don't show up in history but may cause
# problems when using Firefox containers
method: 'POST'
default_http_headers:
X-Content-Type-Options: nosniff
X-Download-Options: noopen
X-Robots-Tag: noindex, nofollow
Referrer-Policy: no-referrer
1 change: 1 addition & 0 deletions server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@
"jsonwebtoken": "^9.0.2",
"langchain": "^0.1.25",
"mammoth": "^1.6.0",
"ml-distance": "^4.0.1",
"pdf-parse": "^1.1.1",
"pdfjs-dist": "^3.7.107",
"pubsub-js": "^1.9.4",
Expand Down
2 changes: 2 additions & 0 deletions server/prisma/migrations/q_14_6/migration.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
-- AlterTable
ALTER TABLE "User" ADD COLUMN "isSuspended" BOOLEAN NOT NULL DEFAULT false;
1 change: 1 addition & 0 deletions server/prisma/schema.prisma
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ model User {
isFirstLogin Boolean @default(true)
isAdministrator Boolean @default(false)
createdAt DateTime @default(now())
isSuspended Boolean @default(false)
bots Bot[]
apiKeys UserApiKey[]
}
Expand Down
159 changes: 159 additions & 0 deletions server/src/internet/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
import * as cheerio from "cheerio";
import { Embeddings } from "@langchain/core/embeddings";
import { Document } from "@langchain/core/documents";
import * as ml_distance from "ml-distance"

const SERACH_PROVIDER = process.env.DB_SEARCH_PROVIDER || "default";
const TOTAL_RESULTS_LIMIT = process.env.DB_TOTAL_RESULTS_LIMIT ? parseInt(process.env.DB_TOTAL_RESULTS_LIMIT) : 5;

export const duckduckgoSearchUnOffical = async (query: string) => {
const abortController = new AbortController();
setTimeout(() => abortController.abort(), 10000);

const htmlString = await fetch(
"https://html.duckduckgo.com/html/?q=" + query,
{
signal: abortController.signal,
}
)
.then((response) => response.text())
.catch();

const $ = cheerio.load(htmlString);

const searchResults = Array.from($("div.results_links_deep")).map(
(result) => {
const title = $(result).find("a.result__a").text();
const link = $(result)
.find("a.result__snippet")
.attr("href")
.replace("//duckduckgo.com/l/?uddg=", "")
.replace(/&rut=.*/, "");

const content = $(result).find("a.result__snippet").text();
const decodedLink = decodeURIComponent(link);
return { title, link: decodedLink, content };
}
);

return searchResults;
};

export const googleSearchUnOffical = async (query: string) => {
const abortController = new AbortController();
setTimeout(() => abortController.abort(), 10000);

const htmlString = await fetch(
"https://www.google.com/search?hl=en&q=" + query,
{
signal: abortController.signal,
}
)
.then((response) => response.text())
.catch();

const $ = cheerio.load(htmlString);

const searchResults = $("div.g").map((_, result) => {
const title = $(result).find("h3").text();
const link = $(result).find("a").attr("href");
const content = $(result).find("span").map((_, span) => $(span).text()).get().join(" ");
return { title, link, content };
}).get();

return searchResults;
};

export const searxngSearch = async (query: string) => {
const abortController = new AbortController();
setTimeout(() => abortController.abort(), 10000);

const searxngUrl = process.env.DB_SEARXNG_URL;

if (!searxngUrl) {
throw new Error("SEARXNG_URL is not set");
}
const url = new URL(`${searxngUrl}/search`);

url.searchParams.append("q", query);
url.searchParams.append("format", "json");
const response = await fetch(url.toString(), {
method: "GET",
headers: {
Accept: "application/json",
},
});

if (!response.ok) {
const err = await response.json();
console.error(`Error: ${err}`);
throw new Error(`Error: ${response.status}`);
}

const data = (await response.json()) as {
results: {
title: string;
url: string;
content: string;
}[];
};

return data.results.map((result) => ({
title: result.title,
link: result.url,
content: result.content,
}));
};

const searchProviders = {
duckduckgo: duckduckgoSearchUnOffical,
google: googleSearchUnOffical,
searxng: searxngSearch,
default:
process.env.IS_RAILWAY != "true"
? searxngSearch
: duckduckgoSearchUnOffical,
};

export const searchInternet = async (embedding: Embeddings, { query }: { query: string }) => {
const searchProvider = searchProviders[SERACH_PROVIDER];
if (!searchProvider) {
throw new Error(`Search provider ${SERACH_PROVIDER} not found`);
}
const datat = await searchProvider(query);

const results = datat.slice(0, TOTAL_RESULTS_LIMIT);

const [docEmbeddings, queryEmbedding] = await Promise.all([
embedding.embedDocuments(results.map((doc) => doc.content)),
embedding.embedQuery(query),
]);


const similarity = docEmbeddings.map((docEmbedding, i) => {
const sim = ml_distance.similarity.cosine(queryEmbedding, docEmbedding)

return {
index: i,
similarity: sim
}
})

const sortedDocs = similarity
.sort((a, b) => b.similarity - a.similarity)
.filter((sim) => sim.similarity > 0.5)
.slice(0, 15)
.map((sim) => {
return [
{
pageContent: results[sim.index]?.content || "",
metadata: {
source: results[sim.index]?.link || "",
}
} as Document,
sim.similarity
]
})

return sortedDocs;
};
48 changes: 38 additions & 10 deletions server/src/utils/store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ import { Document } from "@langchain/core/documents";
import { PrismaClient } from "@prisma/client";
import { Embeddings } from "@langchain/core/embeddings";
import { VectorStore } from "@langchain/core/vectorstores";
import { Callbacks } from "langchain/callbacks";
import { searchInternet } from "../internet";
const prisma = new PrismaClient();
export interface DialoqbaseLibArgs {
botId: string;
Expand Down Expand Up @@ -92,7 +94,8 @@ export class DialoqbaseVectorStore extends VectorStore {
async similaritySearchVectorWithScore(
query: number[],
k: number,
filter?: this["FilterType"] | undefined
filter?: this["FilterType"] | undefined,
originalQuery?: string | undefined
): Promise<[Document<Record<string, any>>, number][]> {
if (!query) {
return [];
Expand All @@ -114,26 +117,51 @@ export class DialoqbaseVectorStore extends VectorStore {
const data = await prisma.$queryRaw`
SELECT * FROM "similarity_search_v2"(query_embedding := ${vector}::vector, botId := ${bot_id}::text,match_count := ${match_count}::int)
`;

const result: [Document, number][] = (
data as SearchEmbeddingsResponse[]
).map((resp) => [

const result = (data as SearchEmbeddingsResponse[]).map((resp) => [
new Document({
metadata: resp.metadata,
pageContent: resp.content,
}),
resp.similarity,
]);

let internetSearchResults = [];
if (botInfo.internetSearchEnabled) {
internetSearchResults = await searchInternet(this.embeddings, {
query: originalQuery,
});
}

const combinedResults = [...result, ...internetSearchResults];
combinedResults.sort((a, b) => b[1] - a[1]);

const topResults = combinedResults.slice(0, k);

if (semanticSearchSimilarityScore === "none") {
return result;
return topResults;
}

const valueInFloat = parseFloat(semanticSearchSimilarityScore);
const filteredResult = result.filter(
([, similarity]) => similarity >= valueInFloat
const similarityThreshold = parseFloat(semanticSearchSimilarityScore);
const filteredResults = topResults.filter(
([, similarity]) => similarity >= similarityThreshold
);
return filteredResults;
}

async similaritySearch(
query: string,
k = 4,
filter: this["FilterType"] | undefined = undefined,
_callbacks: Callbacks | undefined = undefined // implement passing to embedQuery later
): Promise<any[]> {
const results = await this.similaritySearchVectorWithScore(
await this.embeddings.embedQuery(query),
k,
filter,
query
);
return filteredResult;
return results.map((result) => result[0]);
}

_vectorstoreType(): string {
Expand Down
2 changes: 1 addition & 1 deletion server/yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -5552,7 +5552,7 @@ ml-distance-euclidean@^2.0.0:
resolved "https://registry.yarnpkg.com/ml-distance-euclidean/-/ml-distance-euclidean-2.0.0.tgz#3a668d236649d1b8fec96380b9435c6f42c9a817"
integrity sha512-yC9/2o8QF0A3m/0IXqCTXCzz2pNEzvmcE/9HFKOZGnTjatvBbsn4lWYJkxENkA4Ug2fnYl7PXQxnPi21sgMy/Q==

ml-distance@^4.0.0:
ml-distance@^4.0.0, ml-distance@^4.0.1:
version "4.0.1"
resolved "https://registry.yarnpkg.com/ml-distance/-/ml-distance-4.0.1.tgz#4741d17a1735888c5388823762271dfe604bd019"
integrity sha512-feZ5ziXs01zhyFUUUeZV5hwc0f5JW0Sh0ckU1koZe/wdVkJdGxcP06KNQuF0WBTj8FttQUzcvQcpcrOp/XrlEw==
Expand Down

0 comments on commit 7b9793e

Please sign in to comment.