From 691a8a1c40f8f1357c09e8f2ac885865bbad6a5e Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Tue, 30 Sep 2025 16:52:59 -0700 Subject: add new sqlite persistent storage creation and migration --- .gitignore | 4 +++- database.py | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ schema.sql | 49 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 1 deletion(-) create mode 100644 database.py create mode 100644 schema.sql diff --git a/.gitignore b/.gitignore index 9d7d2ab..9037223 100644 --- a/.gitignore +++ b/.gitignore @@ -173,4 +173,6 @@ cython_debug/ news tl_cache.json wac_result_cache.json -summarization_cache.json \ No newline at end of file +summarization_cache.json +*.bak +*.db diff --git a/database.py b/database.py new file mode 100644 index 0000000..a4e5ac9 --- /dev/null +++ b/database.py @@ -0,0 +1,68 @@ +import sqlite3 +import os +import json + +class Database: + def __init__(self): + self._conn = sqlite3.connect("news.db") + self._cursor = self._conn.cursor() + self._initialize_db() + self._migrate_old_data() + + def _initialize_db(self): + with open("schema.sql") as f: + self._cursor.executescript(f.read()) + self._conn.commit() + + def _migrate_old_data(self): + """ + Migrates old summarization, tl and wac files into DB + """ + if os.path.exists("summarization_cache.json"): + print("[Database] Migrating old summarization_cache to DB") + with open("summarization_cache.json", "r") as file: + summ_cache = json.load(file) + for key, val in summ_cache.items(): + self.add_new_summary(key, val["headline"], val["content"]) + os.rename("summarization_cache.json", "summarization_cache.json.bak") + + if os.path.exists("tl_cache.json"): + print("[Database] Migrating old translation cache (tl_cache.json) to DB") + with open("tl_cache.json", "r") as file: + tl_cache = json.load(file) + import hashlib + for entry in tl_cache: + key = hashlib.sha256((entry["source_lang"] + entry["target_lang"] + entry["source_txt"]).encode('utf-8')).hexdigest() + self.add_new_translation(key, entry["source_lang"], entry["target_lang"], entry["source_txt"], entry["result_txt"]) + os.rename("tl_cache.json", "tl_cache.json.bak") + + if os.path.exists("wac_result_cache.json"): + print("[Database] Migrating old WAC Data cache to DB") + with open("wac_result_cache.json", "r") as file: + wac_cache = json.load(file) + import hashlib + for key, value in wac_cache.items(): + self.add_new_wac_entry(key, value[0], value[1]) + os.rename("wac_result_cache.json", "wac_result_cache.json.bak") + + def add_new_wac_entry(self, key: str, is_news: bool, post_type: str): + news_var = 0 if is_news is False else 1 + self._cursor.execute( + "INSERT OR REPLACE INTO wacplus (id, isNews, type) VALUES (?, ?, ?)", + (key, news_var, post_type) + ) + self._conn.commit() + + def add_new_translation(self, key: str, source_lang: str, target_lang: str, source_txt: str, result_txt: str): + self._cursor.execute( + "INSERT OR REPLACE INTO translation (id, source_lang, target_lang, source, result) VALUES (?, ?, ?, ?, ?)", + (key, source_lang, target_lang, source_txt, result_txt) + ) + self._conn.commit() + + def add_new_summary(self, key: str, headline: str, content: str): + self._cursor.execute( + "INSERT OR REPLACE INTO summarization (id, headline, content) VALUES (?, ?, ?)", + (key, headline, content) + ) + self._conn.commit() diff --git a/schema.sql b/schema.sql new file mode 100644 index 0000000..5c2e2f0 --- /dev/null +++ b/schema.sql @@ -0,0 +1,49 @@ +CREATE TABLE IF NOT EXISTS news ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + date TEXT NOT NULL, + identifier TEXT NOT NULL, + type TEXT, + timestamp INTEGER NOT NULL, + headline TEXT, + content TEXT NOT NULL, + url TEXT, + is_ai_summary INTEGER NOT NULL DEFAULT 0, + en_headline TEXT, + en_content TEXT +); + +CREATE TABLE IF NOT EXISTS news_images ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + news_id INTEGER NOT NULL, + image_url TEXT NOT NULL, + link_url TEXT, + FOREIGN KEY (news_id) REFERENCES news(id) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS summarization ( + id VARCHAR(255) PRIMARY KEY, + headline TEXT NOT NULL, + content TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS translation ( + id VARCHAR(255) PRIMARY KEY, + source TEXT NOT NULL, + result TEXT NOT NULL, + source_lang TEXT NOT NULL, + target_lang TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS wacplus ( + id VARCHAR(255) PRIMARY KEY, + isNews INTEGER NOT NULL, + type TEXT NOT NULL +); + +-- Indexes for common queries +CREATE INDEX IF NOT EXISTS idx_news_date ON news(date); +CREATE INDEX IF NOT EXISTS idx_news_identifier ON news(identifier); +CREATE INDEX IF NOT EXISTS idx_news_timestamp ON news(timestamp); +CREATE INDEX IF NOT EXISTS idx_news_type ON news(type); +CREATE INDEX IF NOT EXISTS idx_news_is_ai_summary ON news(is_ai_summary); +CREATE INDEX IF NOT EXISTS idx_news_images_news_id ON news_images(news_id); -- cgit v1.2.3