sqlite3 search

This commit is contained in:
Yaroslav Polyakov 2025-04-16 01:10:25 +07:00
parent 17bf1235b4
commit ee4a12e746
14 changed files with 213 additions and 21 deletions

11
contrib/create.sql Normal file
View file

@ -0,0 +1,11 @@
-- Company search table, one row per company keyed by its object id.
-- Rows are written by add_company() (companydb module) with REPLACE INTO
-- and read by check_by_oid() / get_by_oid() / dbsearch().
CREATE TABLE company (
oid TEXT PRIMARY KEY, -- company object id; REPLACE INTO overwrites the row with the same oid
title TEXT,
address TEXT,
town TEXT,
searchstr TEXT, -- stored lower-cased by add_company(); matched with LIKE by dbsearch()
rating_2gis REAL,
trusted BOOLEAN,
nreviews INTEGER,
detections TEXT -- presumably space-separated detection names (Company.export joins them with ' ') -- confirm
);

View file

@ -27,7 +27,8 @@ from ..exceptions import AFReportNotReady, AFNoCompany
from ..tasks import submit_fraud_task, get_qsize
from ..settings import settings
from ..const import REDIS_TASK_QUEUE_NAME, REDIS_TRUSTED_LIST, REDIS_UNTRUSTED_LIST, REDIS_WORKER_STATUS
from ..search import search
# from ..search import search
from ..companydb import dbsearch
app = FastAPI()
@ -217,15 +218,14 @@ async def submit(request: Request, oid: str = Form(...), force: bool = Form(Fals
@app.post("/search", response_class=HTMLResponse)
async def search_view(request: Request, query: str = Form(...)):
# 15 actually
@app.get("/search", response_class=HTMLResponse)
async def search_view(request: Request, query: str):
if query.isdigit() and len(query) >= 12:
print(f"redirect by id {query!r}")
return RedirectResponse(app.url_path_for("report", oid=query), status_code=303)
else:
print(f"search for {query!r}")
results = search(query, limit=25)
# results = search(query, limit=25)
results = dbsearch(query, limit=25)
print(f"got {len(results)} results for {query!r}")
last_trusted = [json.loads(item) for item in r.lrange(REDIS_TRUSTED_LIST, 0, -1)]

View file

@ -25,6 +25,7 @@ from ..const import REDIS_TASK_QUEUE_NAME, REDIS_TRUSTED_LIST, REDIS_UNTRUSTED_L
from ..logger import logger
from ..session import session
from ..utils import random_company
from ..companydb import add_company, check_by_oid, get_by_oid, dbsearch
def countdown(n=5):
for i in range(n, 0, -1):
@ -144,7 +145,7 @@ def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("cmd", choices=['company-users', 'user-reviews', 'queue', 'explore', 'ip'])
parser.add_argument("cmd", choices=['company-users', 'user-reviews', 'queue', 'explore', 'ip', 'filldb'])
parser.add_argument("-v", "--verbose", default=False, action='store_true')
parser.add_argument("--full", default=False, action='store_true')
parser.add_argument("args", nargs='*', help='extra args')
@ -214,7 +215,6 @@ def main():
print(f"Python: {sys.version}")
print(f"HTTPS_PROXY env variable: {os.getenv('HTTPS_PROXY', None)}")
r = requests.get("https://ipinfo.io/ip", proxies={"https": None, "http": None})
print(f"Direct IP: {r.text}")
@ -244,6 +244,25 @@ def main():
print(f"Meta code: {data['meta']['code']}, rating:{data['meta']['branch_rating']} count: {data['meta']['branch_reviews_count']}/{data['meta']['total_count']}")
print(f"Reviews: {len(data['reviews'])}")
elif cmd == "filldb":
inserted = 0
exist = 0
skipped = 0
for c in cl.companies(oid=args.company, name=args.name, town=args.town, report=args.report, noreport=args.noreport):
if skipped < 1210:
skipped += 1
continue
if check_by_oid(c.object_id):
exist += 1
else:
inserted += 1
print(f"{inserted} add {c.object_id} {c.title}")
add_company(c.export())
print(f"Done. Inserted {inserted} records, {exist} already exists.")
elif cmd == "explore":
if args.town is None:

View file

@ -28,7 +28,8 @@ from ..exceptions import AFNoCompany, AFReportNotReady, AFReportAlreadyExists
from ..settings import settings
from ..statistics import statistics
from ..aliases import resolve_alias
from ..search import search
# from ..search import search
from ..companydb import dbsearch
# CLI
from .summary import printsummary
@ -125,7 +126,7 @@ def main():
print(f"{len(report['relations'])} relations")
elif args.cmd == "search":
res = search(args.args[0])
res = dbsearch(args.args[0])
for rec in res:
print(rec)
@ -133,7 +134,7 @@ def main():
elif args.cmd in ["list", "fraud", "delreport", "wipe", "submitfraud", "export"]:
# sanity check
if args.cmd in ["submitfraud", "delreport", "wipe"] and not any_filter(args):
if args.cmd in ["submitfraud", "fraud", "delreport", "wipe"] and not any_filter(args):
print(f"Need company filter for {args.cmd}")
sys.exit(1)

View file

@ -181,10 +181,15 @@ class Company:
return len(self._reviews)
def count_rate(self):
self.ratings = list()
for r in self._reviews:
if r['rating'] is None:
print_json(data=r)
# 70000001006412601
# rating could be None e.g. when provider=4sq
continue
if r['rating'] is not None:
self.ratings.append(r['rating'])
@ -254,12 +259,16 @@ class Company:
r.raise_for_status()
data = r.json()
# branch review may exists, but not total_count
# 70000001028529798
if self.total_count_2gis is None:
self.total_count_2gis = data['meta']['total_count']
self.branch_count_2gis = data['meta']['branch_reviews_count']
self.branch_rating_2gis = data['meta']['branch_rating']
# print(f"Total/Branch reviews count: {self.total_count_2gis}/{self.branch_count_2gis}")
if self.total_count_2gis == 0 or self.branch_count_2gis == 0:
raise AFNoCompany(f"No reviews for {self.object_id}")
@ -343,6 +352,7 @@ class Company:
'rating_2gis': self.branch_rating_2gis,
'trusted': self.trusted,
'nreviews': self.nreviews(),
'detections': ' '.join(self.detections)
}
if self.trusted is None and self.report_path.exists():

View file

@ -0,0 +1,86 @@
import sqlite3
from .settings import settings
from typing import Optional
from rich import print_json
"""
CREATE TABLE company (
oid TEXT PRIMARY KEY,
title TEXT,
address TEXT,
town TEXT,
searchstr TEXT,
rating_2gis REAL,
trusted BOOLEAN,
nreviews INTEGER,
detections TEXT
);
"""
def make_connection():
    """Open and return a new SQLite connection to ``settings.companydb``."""
    db_path = settings.companydb
    connection = sqlite3.connect(db_path)
    return connection
def check_by_oid(oid: str, conn=None):
    """Return True if a row with the given ``oid`` exists in the ``company`` table.

    Args:
        oid: Company object id to look up.
        conn: Optional open SQLite connection. When omitted, a connection is
            opened with ``make_connection()`` and closed before returning.

    Returns:
        bool: True when the oid is present, False otherwise.
    """
    owns_conn = conn is None
    if owns_conn:
        conn = make_connection()
    try:
        row = conn.execute(
            "SELECT 1 FROM company WHERE oid = ?", (oid,)
        ).fetchone()
        # fetchone() yields None when no row matched.
        return row is not None
    finally:
        # Close only a connection opened here; a caller-supplied one stays open.
        if owns_conn:
            conn.close()
def get_by_oid(oid: str, conn=None) -> Optional[dict]:
    """Fetch one ``company`` row by its ``oid``.

    Args:
        oid: Company object id to look up.
        conn: Optional open SQLite connection. When omitted, a connection is
            opened with ``make_connection()`` and closed before returning.

    Returns:
        A dict mapping column names to values, or None when no row matches.
    """
    owns_conn = conn is None
    if owns_conn:
        conn = make_connection()
    try:
        cursor = conn.execute("SELECT * FROM company WHERE oid = ?", (oid,))
        row = cursor.fetchone()
        if row is None:
            return None
        # cursor.description holds one 7-tuple per column; item 0 is the name.
        col_names = [desc[0] for desc in cursor.description]
        return dict(zip(col_names, row))
    finally:
        # Close only a connection opened here; a caller-supplied one stays open.
        if owns_conn:
            conn.close()
def dbsearch(query: str, limit=20, conn=None) -> list[dict]:
    """Search companies whose ``searchstr`` contains every word of ``query``.

    The query is lower-cased and split on whitespace; each word must occur as
    a substring of ``searchstr`` (words are AND-ed together). ``%`` and ``_``
    typed by the user are matched literally rather than as LIKE wildcards.

    Args:
        query: Free-text search string.
        limit: Maximum number of rows to return.
        conn: Optional open SQLite connection. When omitted, a connection is
            opened with ``make_connection()`` and closed before returning.

    Returns:
        A list of dicts (column name -> value), empty when the query has no
        words or nothing matches.
    """
    words = query.strip().lower().split()
    if not words:
        # Nothing to search for: do not even open a connection.
        return []

    def like_pattern(word: str) -> str:
        # Escape the escape character first, then the two LIKE wildcards.
        escaped = (
            word.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")
        )
        return f"%{escaped}%"

    clauses = " AND ".join(["searchstr LIKE ? ESCAPE '\\'"] * len(words))
    # LIMIT is bound as a parameter instead of being formatted into the SQL.
    sql = f"SELECT * FROM company WHERE {clauses} LIMIT ?"
    params = [like_pattern(w) for w in words]
    params.append(limit)

    owns_conn = conn is None
    if owns_conn:
        conn = make_connection()
    try:
        cursor = conn.execute(sql, params)
        rows = cursor.fetchall()
        col_names = [desc[0] for desc in cursor.description]
        return [dict(zip(col_names, row)) for row in rows]
    finally:
        # Close only a connection opened here; a caller-supplied one stays open.
        if owns_conn:
            conn.close()
def add_company(company_data: dict, conn=None):
    """Insert or replace one row in the ``company`` table and commit.

    Uses ``REPLACE INTO``, so an existing row with the same ``oid`` is
    overwritten. Keys missing from ``company_data`` are stored as NULL.

    Args:
        company_data: Mapping with the keys ``oid``, ``title``, ``address``,
            ``town``, ``searchstr``, ``rating_2gis``, ``trusted``,
            ``nreviews`` and ``detections``.
        conn: Optional open SQLite connection. When omitted, a connection is
            opened with ``make_connection()`` and closed before returning.
    """
    sql = """
    REPLACE INTO company (oid, title, address, town, searchstr, rating_2gis, trusted, nreviews, detections)
    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
    """
    # searchstr is stored lower-cased so lookups can compare against a
    # lower-cased query; a missing/None value is kept as NULL instead of
    # raising AttributeError on ``None.lower()``.
    searchstr = company_data.get("searchstr")
    if searchstr is not None:
        searchstr = searchstr.lower()

    values = (
        company_data.get("oid"),
        company_data.get("title"),
        company_data.get("address"),
        company_data.get("town"),
        searchstr,
        company_data.get("rating_2gis"),
        company_data.get("trusted"),
        company_data.get("nreviews"),
        company_data.get("detections"),
    )

    owns_conn = conn is None
    if owns_conn:
        conn = make_connection()
    try:
        conn.execute(sql, values)
        conn.commit()
    finally:
        # Close only a connection opened here; a caller-supplied one stays open.
        if owns_conn:
            conn.close()

View file

@ -16,12 +16,14 @@ from .db import db
from .const import WSCORE_THRESHOLD, WSCORE_HITS_THRESHOLD, MAX_USER_REVIEWS
from .logger import logger
from .company import Company, CompanyList
from .companydb import add_company, get_by_oid, check_by_oid
from .user import User, get_user
from .relation import RelationDict, _is_dangerous
from .settings import settings
from .exceptions import AFReportNotReady, AFNoCompany, AFReportAlreadyExists
# from .usernotes import Usernotes
from .fd.master import MasterFD
from .search import company_indexed, index_company
def detect(c: Company, cl: CompanyList, explain: bool = False, force=False):
@ -63,6 +65,12 @@ def detect(c: Company, cl: CompanyList, explain: bool = False, force=False):
report = dict()
report['score'] = score
report['relations'] = list()
c.trusted = True
c.detections = list()
add_company(c.export())
with gzip.open(c.report_path, "wt") as fh:
json.dump(report, fh)
@ -111,6 +119,8 @@ def detect(c: Company, cl: CompanyList, explain: bool = False, force=False):
dnames = [ dline.split(' ')[0] for dline in score['detections'] ]
trust_line = f"RISK {len(dnames)} {'+'.join(dnames)}"
add_company(c.export())
logger.info(f"DETECTION RESULT {c} {trust_line}")
return score

View file

@ -1,7 +1,9 @@
import subprocess
from .settings import settings
from .company import Company
import json
import fcntl
def is_safe_search(query):
# Проверяем, что строка состоит из букв, цифр и пробелов
@ -28,3 +30,31 @@ def search(query: str, limit=50):
except json.JSONDecodeError:
print("Error decoding JSON:", r.stdout)
return list()
# Append a company's exported record to the JSONL file at settings.searchnew
# unless company_indexed() already reports it as indexed.
# Currently a no-op apart from the check: the temporary bare `return` below
# exits before anything is written.
# NOTE(review): settings.searchnew appears to be commented out in Settings in
# this same change -- confirm before re-enabling the write path.
def index_company(c: Company):
# Skip companies that company_indexed() reports as already present.
if company_indexed(c.object_id):
print(f"{c.object_id} already indexed")
return
# tmp: indexing is disabled; everything after this return is unreachable.
return
# Unreachable while the return above is in place: append one JSON line
# while holding an exclusive flock on the file.
with open(settings.searchnew, "a", encoding="utf-8") as fh:
fcntl.flock(fh, fcntl.LOCK_EX)
fh.write(json.dumps(c.export()) + "\n")
fcntl.flock(fh, fcntl.LOCK_UN)
def company_indexed(oid: str, path=None):
    """Report whether a record with the given ``oid`` is present in a JSONL index.

    Args:
        oid: Company object id to look for (compared with the ``oid`` field
            of each JSON record).
        path: Index file to scan. When None, the files named by
            ``settings.search`` and ``settings.searchnew`` are checked in
            turn; a setting that is not defined is skipped.

    Returns:
        bool: True when ``jq`` finds at least one matching record, False when
        the file is missing, nothing matches, or ``jq`` fails.
    """
    if path is None:
        # Either attribute may be absent from settings, so look them up
        # defensively instead of raising AttributeError.
        candidates = [getattr(settings, name, None) for name in ("search", "searchnew")]
        for index in candidates:
            if index is not None and company_indexed(oid, index):
                print(f"{oid} already indexed in {index}")
                return True
        return False

    if not path.exists():
        return False

    # Pass oid through --arg so it is treated as data, never as jq syntax.
    r = subprocess.run(
        ["jq", "-c", "--arg", "oid", oid, "select(.oid == $oid)", str(path)],
        capture_output=True,
        text=True,
    )
    print(f"indexed? {path} {r.returncode} {len(r.stdout)}")
    # The original fell off the end here (returning None), so the aggregate
    # branch above could never see a positive result.
    return r.returncode == 0 and bool(r.stdout.strip())

View file

@ -9,7 +9,9 @@ class Settings():
self.user_storage = self.storage / "users"
self.private_user_storage = self.storage / "users" / "_private.json"
self.company_storage = self.storage / "companies"
self.search = self.storage / "search.jsonl"
# self.search = self.storage / "search.jsonl"
# self.searchnew = self.storage / "searchnew.jsonl"
self.companydb = self.storage / "companies.db"
# trust company if <= min_reviews
self.min_reviews = int(os.getenv('MIN_REVIEWS', '20'))

View file

@ -27,8 +27,17 @@ function turnstileCallback(){
submit_btn.disabled = false;
}
/**
 * Wire the "toggle-link" element so that clicking it toggles the "show"
 * CSS class on the "recalc-box" element.
 * Does nothing if the link is absent; a click is a no-op (apart from
 * suppressing the default action) if the box is absent.
 */
function make_toggle_link(){
    const link = document.getElementById("toggle-link");
    if (!link) {
        return;
    }
    link.addEventListener("click", function(e) {
        e.preventDefault();
        // Guard the box lookup the same way the link lookup is guarded.
        document.getElementById("recalc-box")?.classList.toggle("show");
    });
}
function main(){
make_auto_refresh()
make_auto_refresh();
make_toggle_link();
}
main()

View file

@ -386,4 +386,17 @@ li {
.cf-turnstile {
width: 300px;
height: 65px;
}
}
/* Recalculation box: hidden by default, revealed when the "show" class is
   toggled on it.
   NOTE(review): display switches between none and flex together with
   max-height, so the max-height transition may not visibly animate --
   confirm in a browser. */
#recalc-box {
display: none;
max-height: 0;
overflow: hidden;
transition: max-height 0.5s ease;
}
#recalc-box.show {
display: flex;
max-height: 200px; /* or auto for a dynamic size */
}

View file

@ -11,6 +11,7 @@ from .logger import logger
from .const import REDIS_WORKER_STATUS, REDIS_TRUSTED_LIST, REDIS_UNTRUSTED_LIST, REDIS_TASK_QUEUE_NAME, REDIS_DRAMATIQ_QUEUE
from .user import reset_user_pool
from .statistics import statistics
from .search import index_company
broker = dramatiq.get_broker()

View file

@ -22,7 +22,7 @@
<h1>{% block header %}{% endblock %}</h1>
</div>
<div class="search-container">
<form class="search-form" id="search-form" method="POST" action="/search">
<form class="search-form" id="search-form" method="GET" action="/search">
<input type="text" id="oid" name="query" value="{{query}}" placeholder="Введите название компании или 2GIS object_id" required>
<button type="submit">Найти</button>
</form>

View file

@ -114,10 +114,10 @@
</div>
<div id="actionbox">
<p>Можно пересчитать (на случай если в алгоритме поменяли параметры со времени расчета).</p>
<p>Можно <a href="javascript:void(0)" class="toggle" id="toggle-link">пересчитать</a> (на случай если в алгоритме поменяли параметры со времени расчета).</p>
<form method="POST" action="/submit">
<div class="flexdiv">
<div class="flexdiv" id="recalc-box">
{%if settings.turnstile_sitekey %}
{% set btn_disabled = "disabled" %}
@ -129,11 +129,11 @@
<div>
<input type="hidden" name="force" value="true">
<input type="hidden" name="oid" value="{{oid}}">
<button id="submit_btn" {{btn_disabled}}>Пересчитать</button>
<button type="submit" id="submit_btn" {{btn_disabled}}>Пересчитать</button>
</div>
</div>
</form>
</div>
</div> <!-- actionbox -->
</div>