diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-10-01 23:43:22 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-10-02 01:27:15 -0700 |
| commit | c69189dbf844842049ca8a511803da89b5d1d9e9 (patch) | |
| tree | 88784ff3d0f09ef9dfdbf94c919fd25f93ea4f99 | |
| parent | 63dd4995224db0540be1bca7a8f8d463483f8e43 (diff) | |
add news feed archival to db
| -rw-r--r-- | database.py | 93 | ||||
| -rw-r--r-- | generate.py | 20 | ||||
| -rw-r--r-- | middleware/package.json | 49 | ||||
| -rw-r--r-- | middleware/pnpm-lock.yaml | 15 | ||||
| -rw-r--r-- | schema.sql | 7 |
5 files changed, 115 insertions, 69 deletions
diff --git a/database.py b/database.py index 0e1da81..2f6ae9a 100644 --- a/database.py +++ b/database.py @@ -1,6 +1,7 @@ -import sqlite3 -import os import json +import os +import sqlite3 + class Database: def __init__(self): @@ -36,9 +37,22 @@ class Database: with open("tl_cache.json", "r") as file: tl_cache = json.load(file) import hashlib + for entry in tl_cache: - key = hashlib.sha256((entry["source_lang"] + entry["target_lang"] + entry["source_txt"]).encode('utf-8')).hexdigest() - self.add_new_translation(key, entry["source_lang"], entry["target_lang"], entry["source_txt"], entry["result_txt"]) + key = hashlib.sha256( + ( + entry["source_lang"] + + entry["target_lang"] + + entry["source_txt"] + ).encode("utf-8") + ).hexdigest() + self.add_new_translation( + key, + entry["source_lang"], + entry["target_lang"], + entry["source_txt"], + entry["result_txt"], + ) os.rename("tl_cache.json", "tl_cache.json.bak") if os.path.exists("wac_result_cache.json"): @@ -46,6 +60,7 @@ class Database: with open("wac_result_cache.json", "r") as file: wac_cache = json.load(file) import hashlib + for key, value in wac_cache.items(): self.add_new_wac_entry(key, value[0], value[1]) os.rename("wac_result_cache.json", "wac_result_cache.json.bak") @@ -53,29 +68,65 @@ class Database: def add_new_wac_entry(self, key: str, is_news: bool, post_type: str): news_var = 0 if is_news is False else 1 self._cursor.execute( - "INSERT OR REPLACE INTO wacplus (id, isNews, type) VALUES (?, ?, ?)", - (key, news_var, post_type) - ) + "INSERT OR REPLACE INTO wacplus (id, isNews, type) VALUES (?, ?, ?)", + (key, news_var, post_type), + ) self._conn.commit() - def add_new_translation(self, key: str, source_lang: str, target_lang: str, source_txt: str, result_txt: str): + def add_new_translation( + self, + key: str, + source_lang: str, + target_lang: str, + source_txt: str, + result_txt: str, + ): self._cursor.execute( - "INSERT OR REPLACE INTO translation (id, source_lang, target_lang, source, result) VALUES (?, ?, ?, ?, ?)", - (key, source_lang, target_lang, source_txt, result_txt) - ) + "INSERT OR REPLACE INTO translation (id, source_lang, target_lang, source, result) VALUES (?, ?, ?, ?, ?)", + (key, source_lang, target_lang, source_txt, result_txt), + ) self._conn.commit() def add_new_summary(self, key: str, headline: str, content: str): self._cursor.execute( - "INSERT OR REPLACE INTO summarization (id, headline, content) VALUES (?, ?, ?)", - (key, headline, content) - ) + "INSERT OR REPLACE INTO summarization (id, headline, content) VALUES (?, ?, ?)", + (key, headline, content), + ) self._conn.commit() + def add_news_entry(self, key: str, news_entry: dict): + is_ai_summary = 1 if news_entry.get("is_ai_summary", False) else 0 + en_headline = news_entry.get("en_headline", None) + en_content = news_entry.get("en_content", None) + headline = news_entry.get("headline", None) + url = news_entry.get("url", None) + self._cursor.execute( + "INSERT OR REPLACE INTO news (news_id, date, identifier, type, timestamp, headline, content, url, is_ai_summary, en_headline, en_content) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)", + ( + key, + news_entry["date"], + news_entry["identifier"], + news_entry["type"], + news_entry["timestamp"], + headline, + news_entry["content"], + url, + is_ai_summary, + en_headline, + en_content, + ), + ) + for image_entry in news_entry["images"]: + self._cursor.execute( + "INSERT OR REPLACE INTO news_images (news_id, image_url, link_url) VALUES (?, ?, ?)", + (key, image_entry["image"], image_entry["link"]), + ) + self._conn.commit() + + def get_summary(self, key: str): self._cursor.execute( - "SELECT headline, content FROM summarization WHERE id = ?", - (key,) + "SELECT headline, content FROM summarization WHERE id = ?", (key,) ) result = self._cursor.fetchone() if result is None: @@ -83,20 +134,14 @@ class Database: return {"headline": result[0], "content": result[1]} def get_translation(self, key: str): - self._cursor.execute( - "SELECT result FROM translation WHERE id = ?", - (key,) - ) + self._cursor.execute("SELECT result FROM translation WHERE id = ?", (key,)) result = self._cursor.fetchone() if result is None: return None return result[0] def get_wac_entry(self, key: str): - self._cursor.execute( - "SELECT isNews, type FROM wacplus WHERE id = ?", - (key,) - ) + self._cursor.execute("SELECT isNews, type FROM wacplus WHERE id = ?", (key,)) result = self._cursor.fetchone() if result is None: return None diff --git a/generate.py b/generate.py index fac8c57..3611100 100644 --- a/generate.py +++ b/generate.py @@ -10,15 +10,25 @@ import hashlib import os from dotenv import load_dotenv from datetime import datetime, timedelta +from database import Database load_dotenv() OUTPUT_DIR = "news" +ARCHIVE_NEWS = True def compute_json_hash(data): return hashlib.sha256(json.dumps(data, sort_keys=True).encode('utf-8')).hexdigest() +def save_news_to_db(news_feed: list): + log_output("Writing news to local save database. This is purely for archival reasons") + database = Database() + for entry in news_feed: + key = compute_json_hash(entry) + database.add_news_entry(key, entry) + database.close() + def create_merged_feed(*news_lists, limit=constants.DAYS_LIMIT): """ Generator-based memory-efficient merging of multiple news feeds. @@ -56,7 +66,13 @@ def log_output(message: str, type: str="DEBUG"): def generate_news_file(filename, url, version=None): log_output(f"Fetching {filename.upper()} News Data", "NEWS") - news_data = feed.get_news(url, version) if version else feed.get_news(url) + news_data = None + try: + news_data = feed.get_news(url, version) if version else feed.get_news(url) + except Exception as e: + print(e) + print("[ERROR] Wasn't able to fetch news. Skipping...") + path = f"{OUTPUT_DIR}/{filename}.json" if news_data: log_output(f"Success. Got {filename.upper()} News Data. Saving to file...", "NEWS") @@ -207,6 +223,8 @@ if __name__ == "__main__": wmmt_news ) log_output("Creating merged news.json file for all news that are within " + str(constants.DAYS_LIMIT) + " days old") + if ARCHIVE_NEWS: + save_news_to_db(news) with open(OUTPUT_DIR+'/news.json', 'w') as json_file: json.dump(attach_news_meta_data(news), json_file) log_output("JOB DONE", "TASK") diff --git a/middleware/package.json b/middleware/package.json index 288c55b..a4446e3 100644 --- a/middleware/package.json +++ b/middleware/package.json @@ -1,27 +1,26 @@ { - "name": "og-cron", - "version": "0.1.0", - "private": true, - "scripts": { - "dev": "next dev", - "build": "next build", - "start": "next start", - "lint": "next lint" - }, - "dependencies": { - "@next/font": "14.2.15", - "@types/node": "24.0.8", - "@types/react": "19.1.8", - "@types/react-dom": "19.1.6", - "@upstash/redis": "^1.35.0", - "@vercel/analytics": "^1.5.0", - "@vercel/og": "^0.6.8", - "eslint": "9.30.0", - "eslint-config-next": "15.3.4", - "global": "^4.4.0", - "next": "^15.3.4", - "react": "19.1.0", - "react-dom": "19.1.0", - "typescript": "5.8.3" - } + "name": "og-cron", + "version": "0.1.0", + "private": true, + "scripts": { + "dev": "next dev", + "build": "next build", + "start": "next start", + "lint": "next lint" + }, + "dependencies": { + "@next/font": "14.2.15", + "@types/node": "24.0.8", + "@types/react": "19.1.8", + "@types/react-dom": "19.1.6", + "@vercel/analytics": "^1.5.0", + "@vercel/og": "^0.6.8", + "eslint": "9.30.0", + "eslint-config-next": "15.3.4", + "global": "^4.4.0", + "next": "^15.3.4", + "react": "19.1.0", + "react-dom": "19.1.0", + "typescript": "5.8.3" + } } diff --git a/middleware/pnpm-lock.yaml b/middleware/pnpm-lock.yaml index 83f2e87..99b252e 100644 --- a/middleware/pnpm-lock.yaml +++ b/middleware/pnpm-lock.yaml @@ -20,9 +20,6 @@ importers: '@types/react-dom': specifier: 19.1.6 version: 19.1.6(@types/react@19.1.8) - '@upstash/redis': - specifier: ^1.35.0 - version: 1.35.0 '@vercel/analytics': specifier: ^1.5.0 version: 1.5.0(next@15.3.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0))(react@19.1.0) @@ -516,9 +513,6 @@ packages: cpu: [x64] os: [win32] - '@upstash/redis@1.35.0': - resolution: {integrity: sha512-WUm0Jz1xN4DBDGeJIi2Y0kVsolWRB2tsVds4SExaiLg4wBdHFMB+8IfZtBWr+BP0FvhuBr5G1/VLrJ9xzIWHsg==} - '@vercel/analytics@1.5.0': resolution: {integrity: sha512-MYsBzfPki4gthY5HnYN7jgInhAZ7Ac1cYDoRWFomwGHWEX7odTEzbtg9kf/QSo7XEsEAqlQugA6gJ2WS2DEa3g==} peerDependencies: @@ -1665,9 +1659,6 @@ packages: resolution: {integrity: sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==} engines: {node: '>= 0.4'} - uncrypto@0.1.3: - resolution: {integrity: sha512-Ql87qFHB3s/De2ClA9e0gsnS6zXG27SkTiSJwjCc9MebbfapQfuPzumMIUMi38ezPZVNFcHI9sUIepeQfw8J8Q==} - undici-types@7.8.0: resolution: {integrity: sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==} @@ -2118,10 +2109,6 @@ snapshots: '@unrs/resolver-binding-win32-x64-msvc@1.9.2': optional: true - '@upstash/redis@1.35.0': - dependencies: - uncrypto: 0.1.3 - '@vercel/analytics@1.5.0(next@15.3.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0))(react@19.1.0)': optionalDependencies: next: 15.3.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0) @@ -3525,8 +3512,6 @@ snapshots: has-symbols: 1.1.0 which-boxed-primitive: 1.1.1 - uncrypto@0.1.3: {} - undici-types@7.8.0: {} unicode-trie@2.0.0: @@ -1,5 +1,5 @@ CREATE TABLE IF NOT EXISTS news ( - id INTEGER PRIMARY KEY AUTOINCREMENT, + news_id VARCHAR(255) NOT NULL PRIMARY KEY, date TEXT NOT NULL, identifier TEXT NOT NULL, type TEXT, @@ -13,11 +13,10 @@ CREATE TABLE IF NOT EXISTS news ( ); CREATE TABLE IF NOT EXISTS news_images ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - news_id INTEGER NOT NULL, + news_id VARCHAR(255) NOT NULL PRIMARY KEY, image_url TEXT NOT NULL, link_url TEXT, - FOREIGN KEY (news_id) REFERENCES news(id) ON DELETE CASCADE + FOREIGN KEY (news_id) REFERENCES news(news_id) ON DELETE CASCADE ); CREATE TABLE IF NOT EXISTS summarization ( |
