aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2025-09-30 16:52:59 -0700
committerPinapelz <yukais@pinapelz.com>2025-09-30 16:53:19 -0700
commit691a8a1c40f8f1357c09e8f2ac885865bbad6a5e (patch)
treea16c32496afdefc1f159811cfde80849b8a97a40
parente81d5213da85cad3bd1b8bb5b10e3871c03f6ba9 (diff)
add new sqlite persistent storage creation and migration
-rw-r--r--.gitignore4
-rw-r--r--database.py68
-rw-r--r--schema.sql49
3 files changed, 120 insertions, 1 deletions
diff --git a/.gitignore b/.gitignore
index 9d7d2ab..9037223 100644
--- a/.gitignore
+++ b/.gitignore
@@ -173,4 +173,6 @@ cython_debug/
news
tl_cache.json
wac_result_cache.json
-summarization_cache.json \ No newline at end of file
+summarization_cache.json
+*.bak
+*.db
diff --git a/database.py b/database.py
new file mode 100644
index 0000000..a4e5ac9
--- /dev/null
+++ b/database.py
@@ -0,0 +1,68 @@
+import sqlite3
+import os
+import json
+
+class Database:
+ def __init__(self):
+ self._conn = sqlite3.connect("news.db")
+ self._cursor = self._conn.cursor()
+ self._initialize_db()
+ self._migrate_old_data()
+
+ def _initialize_db(self):
+ with open("schema.sql") as f:
+ self._cursor.executescript(f.read())
+ self._conn.commit()
+
+ def _migrate_old_data(self):
+ """
+ Migrates old summarization, tl and wac files into DB
+ """
+ if os.path.exists("summarization_cache.json"):
+ print("[Database] Migrating old summarization_cache to DB")
+ with open("summarization_cache.json", "r") as file:
+ summ_cache = json.load(file)
+ for key, val in summ_cache.items():
+ self.add_new_summary(key, val["headline"], val["content"])
+ os.rename("summarization_cache.json", "summarization_cache.json.bak")
+
+ if os.path.exists("tl_cache.json"):
+ print("[Database] Migrating old translation cache (tl_cache.json) to DB")
+ with open("tl_cache.json", "r") as file:
+ tl_cache = json.load(file)
+ import hashlib
+ for entry in tl_cache:
+ key = hashlib.sha256((entry["source_lang"] + entry["target_lang"] + entry["source_txt"]).encode('utf-8')).hexdigest()
+ self.add_new_translation(key, entry["source_lang"], entry["target_lang"], entry["source_txt"], entry["result_txt"])
+ os.rename("tl_cache.json", "tl_cache.json.bak")
+
+ if os.path.exists("wac_result_cache.json"):
+ print("[Database] Migrating old WAC Data cache to DB")
+ with open("wac_result_cache.json", "r") as file:
+ wac_cache = json.load(file)
+ import hashlib
+ for key, value in wac_cache.items():
+ self.add_new_wac_entry(key, value[0], value[1])
+ os.rename("wac_result_cache.json", "wac_result_cache.json.bak")
+
+ def add_new_wac_entry(self, key: str, is_news: bool, post_type: str):
+ news_var = 0 if is_news is False else 1
+ self._cursor.execute(
+ "INSERT OR REPLACE INTO wacplus (id, isNews, type) VALUES (?, ?, ?)",
+ (key, news_var, post_type)
+ )
+ self._conn.commit()
+
+ def add_new_translation(self, key: str, source_lang: str, target_lang: str, source_txt: str, result_txt: str):
+ self._cursor.execute(
+ "INSERT OR REPLACE INTO translation (id, source_lang, target_lang, source, result) VALUES (?, ?, ?, ?, ?)",
+ (key, source_lang, target_lang, source_txt, result_txt)
+ )
+ self._conn.commit()
+
+ def add_new_summary(self, key: str, headline: str, content: str):
+ self._cursor.execute(
+ "INSERT OR REPLACE INTO summarization (id, headline, content) VALUES (?, ?, ?)",
+ (key, headline, content)
+ )
+ self._conn.commit()
diff --git a/schema.sql b/schema.sql
new file mode 100644
index 0000000..5c2e2f0
--- /dev/null
+++ b/schema.sql
@@ -0,0 +1,49 @@
+CREATE TABLE IF NOT EXISTS news (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ date TEXT NOT NULL,
+ identifier TEXT NOT NULL,
+ type TEXT,
+ timestamp INTEGER NOT NULL,
+ headline TEXT,
+ content TEXT NOT NULL,
+ url TEXT,
+ is_ai_summary INTEGER NOT NULL DEFAULT 0,
+ en_headline TEXT,
+ en_content TEXT
+);
+
+CREATE TABLE IF NOT EXISTS news_images (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ news_id INTEGER NOT NULL,
+ image_url TEXT NOT NULL,
+ link_url TEXT,
+ FOREIGN KEY (news_id) REFERENCES news(id) ON DELETE CASCADE
+);
+
+CREATE TABLE IF NOT EXISTS summarization (
+ id VARCHAR(255) PRIMARY KEY,
+ headline TEXT NOT NULL,
+ content TEXT NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS translation (
+ id VARCHAR(255) PRIMARY KEY,
+ source TEXT NOT NULL,
+ result TEXT NOT NULL,
+ source_lang TEXT NOT NULL,
+ target_lang TEXT NOT NULL
+);
+
+CREATE TABLE IF NOT EXISTS wacplus (
+ id VARCHAR(255) PRIMARY KEY,
+ isNews INTEGER NOT NULL,
+ type TEXT NOT NULL
+);
+
+-- Indexes for common queries
+CREATE INDEX IF NOT EXISTS idx_news_date ON news(date);
+CREATE INDEX IF NOT EXISTS idx_news_identifier ON news(identifier);
+CREATE INDEX IF NOT EXISTS idx_news_timestamp ON news(timestamp);
+CREATE INDEX IF NOT EXISTS idx_news_type ON news(type);
+CREATE INDEX IF NOT EXISTS idx_news_is_ai_summary ON news(is_ai_summary);
+CREATE INDEX IF NOT EXISTS idx_news_images_news_id ON news_images(news_id);
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage