diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-09-30 16:52:59 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-09-30 16:53:19 -0700 |
| commit | 691a8a1c40f8f1357c09e8f2ac885865bbad6a5e (patch) | |
| tree | a16c32496afdefc1f159811cfde80849b8a97a40 | |
| parent | e81d5213da85cad3bd1b8bb5b10e3871c03f6ba9 (diff) | |
add new sqlite persistent storage creation and migration
| -rw-r--r-- | .gitignore | 4 | ||||
| -rw-r--r-- | database.py | 68 | ||||
| -rw-r--r-- | schema.sql | 49 |
3 files changed, 120 insertions, 1 deletions
@@ -173,4 +173,6 @@ cython_debug/ news tl_cache.json wac_result_cache.json -summarization_cache.json
\ No newline at end of file +summarization_cache.json +*.bak +*.db diff --git a/database.py b/database.py new file mode 100644 index 0000000..a4e5ac9 --- /dev/null +++ b/database.py @@ -0,0 +1,68 @@ +import sqlite3 +import os +import json + +class Database: + def __init__(self): + self._conn = sqlite3.connect("news.db") + self._cursor = self._conn.cursor() + self._initialize_db() + self._migrate_old_data() + + def _initialize_db(self): + with open("schema.sql") as f: + self._cursor.executescript(f.read()) + self._conn.commit() + + def _migrate_old_data(self): + """ + Migrates old summarization, tl and wac files into DB + """ + if os.path.exists("summarization_cache.json"): + print("[Database] Migrating old summarization_cache to DB") + with open("summarization_cache.json", "r") as file: + summ_cache = json.load(file) + for key, val in summ_cache.items(): + self.add_new_summary(key, val["headline"], val["content"]) + os.rename("summarization_cache.json", "summarization_cache.json.bak") + + if os.path.exists("tl_cache.json"): + print("[Database] Migrating old translation cache (tl_cache.json) to DB") + with open("tl_cache.json", "r") as file: + tl_cache = json.load(file) + import hashlib + for entry in tl_cache: + key = hashlib.sha256((entry["source_lang"] + entry["target_lang"] + entry["source_txt"]).encode('utf-8')).hexdigest() + self.add_new_translation(key, entry["source_lang"], entry["target_lang"], entry["source_txt"], entry["result_txt"]) + os.rename("tl_cache.json", "tl_cache.json.bak") + + if os.path.exists("wac_result_cache.json"): + print("[Database] Migrating old WAC Data cache to DB") + with open("wac_result_cache.json", "r") as file: + wac_cache = json.load(file) + import hashlib + for key, value in wac_cache.items(): + self.add_new_wac_entry(key, value[0], value[1]) + os.rename("wac_result_cache.json", "wac_result_cache.json.bak") + + def add_new_wac_entry(self, key: str, is_news: bool, post_type: str): + news_var = 0 if is_news is False else 1 + self._cursor.execute( + "INSERT OR REPLACE INTO wacplus (id, isNews, type) VALUES (?, ?, ?)", + (key, news_var, post_type) + ) + self._conn.commit() + + def add_new_translation(self, key: str, source_lang: str, target_lang: str, source_txt: str, result_txt: str): + self._cursor.execute( + "INSERT OR REPLACE INTO translation (id, source_lang, target_lang, source, result) VALUES (?, ?, ?, ?, ?)", + (key, source_lang, target_lang, source_txt, result_txt) + ) + self._conn.commit() + + def add_new_summary(self, key: str, headline: str, content: str): + self._cursor.execute( + "INSERT OR REPLACE INTO summarization (id, headline, content) VALUES (?, ?, ?)", + (key, headline, content) + ) + self._conn.commit() diff --git a/schema.sql b/schema.sql new file mode 100644 index 0000000..5c2e2f0 --- /dev/null +++ b/schema.sql @@ -0,0 +1,49 @@ +CREATE TABLE IF NOT EXISTS news ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + date TEXT NOT NULL, + identifier TEXT NOT NULL, + type TEXT, + timestamp INTEGER NOT NULL, + headline TEXT, + content TEXT NOT NULL, + url TEXT, + is_ai_summary INTEGER NOT NULL DEFAULT 0, + en_headline TEXT, + en_content TEXT +); + +CREATE TABLE IF NOT EXISTS news_images ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + news_id INTEGER NOT NULL, + image_url TEXT NOT NULL, + link_url TEXT, + FOREIGN KEY (news_id) REFERENCES news(id) ON DELETE CASCADE +); + +CREATE TABLE IF NOT EXISTS summarization ( + id VARCHAR(255) PRIMARY KEY, + headline TEXT NOT NULL, + content TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS translation ( + id VARCHAR(255) PRIMARY KEY, + source TEXT NOT NULL, + result TEXT NOT NULL, + source_lang TEXT NOT NULL, + target_lang TEXT NOT NULL +); + +CREATE TABLE IF NOT EXISTS wacplus ( + id VARCHAR(255) PRIMARY KEY, + isNews INTEGER NOT NULL, + type TEXT NOT NULL +); + +-- Indexes for common queries +CREATE INDEX IF NOT EXISTS idx_news_date ON news(date); +CREATE INDEX IF NOT EXISTS idx_news_identifier ON news(identifier); +CREATE INDEX IF NOT EXISTS idx_news_timestamp ON news(timestamp); +CREATE INDEX IF NOT EXISTS idx_news_type ON news(type); +CREATE INDEX IF NOT EXISTS idx_news_is_ai_summary ON news(is_ai_summary); +CREATE INDEX IF NOT EXISTS idx_news_images_news_id ON news_images(news_id); |
