aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPinapelz <yukais@pinapelz.com>2025-10-01 23:43:22 -0700
committerPinapelz <yukais@pinapelz.com>2025-10-02 01:27:15 -0700
commitc69189dbf844842049ca8a511803da89b5d1d9e9 (patch)
tree88784ff3d0f09ef9dfdbf94c919fd25f93ea4f99
parent63dd4995224db0540be1bca7a8f8d463483f8e43 (diff)
add news feed archival to db
-rw-r--r--database.py93
-rw-r--r--generate.py20
-rw-r--r--middleware/package.json49
-rw-r--r--middleware/pnpm-lock.yaml15
-rw-r--r--schema.sql7
5 files changed, 115 insertions, 69 deletions
diff --git a/database.py b/database.py
index 0e1da81..2f6ae9a 100644
--- a/database.py
+++ b/database.py
@@ -1,6 +1,7 @@
-import sqlite3
-import os
import json
+import os
+import sqlite3
+
class Database:
def __init__(self):
@@ -36,9 +37,22 @@ class Database:
with open("tl_cache.json", "r") as file:
tl_cache = json.load(file)
import hashlib
+
for entry in tl_cache:
- key = hashlib.sha256((entry["source_lang"] + entry["target_lang"] + entry["source_txt"]).encode('utf-8')).hexdigest()
- self.add_new_translation(key, entry["source_lang"], entry["target_lang"], entry["source_txt"], entry["result_txt"])
+ key = hashlib.sha256(
+ (
+ entry["source_lang"]
+ + entry["target_lang"]
+ + entry["source_txt"]
+ ).encode("utf-8")
+ ).hexdigest()
+ self.add_new_translation(
+ key,
+ entry["source_lang"],
+ entry["target_lang"],
+ entry["source_txt"],
+ entry["result_txt"],
+ )
os.rename("tl_cache.json", "tl_cache.json.bak")
if os.path.exists("wac_result_cache.json"):
@@ -46,6 +60,7 @@ class Database:
with open("wac_result_cache.json", "r") as file:
wac_cache = json.load(file)
import hashlib
+
for key, value in wac_cache.items():
self.add_new_wac_entry(key, value[0], value[1])
os.rename("wac_result_cache.json", "wac_result_cache.json.bak")
@@ -53,29 +68,65 @@ class Database:
def add_new_wac_entry(self, key: str, is_news: bool, post_type: str):
news_var = 0 if is_news is False else 1
self._cursor.execute(
- "INSERT OR REPLACE INTO wacplus (id, isNews, type) VALUES (?, ?, ?)",
- (key, news_var, post_type)
- )
+ "INSERT OR REPLACE INTO wacplus (id, isNews, type) VALUES (?, ?, ?)",
+ (key, news_var, post_type),
+ )
self._conn.commit()
- def add_new_translation(self, key: str, source_lang: str, target_lang: str, source_txt: str, result_txt: str):
+ def add_new_translation(
+ self,
+ key: str,
+ source_lang: str,
+ target_lang: str,
+ source_txt: str,
+ result_txt: str,
+ ):
self._cursor.execute(
- "INSERT OR REPLACE INTO translation (id, source_lang, target_lang, source, result) VALUES (?, ?, ?, ?, ?)",
- (key, source_lang, target_lang, source_txt, result_txt)
- )
+ "INSERT OR REPLACE INTO translation (id, source_lang, target_lang, source, result) VALUES (?, ?, ?, ?, ?)",
+ (key, source_lang, target_lang, source_txt, result_txt),
+ )
self._conn.commit()
def add_new_summary(self, key: str, headline: str, content: str):
self._cursor.execute(
- "INSERT OR REPLACE INTO summarization (id, headline, content) VALUES (?, ?, ?)",
- (key, headline, content)
- )
+ "INSERT OR REPLACE INTO summarization (id, headline, content) VALUES (?, ?, ?)",
+ (key, headline, content),
+ )
self._conn.commit()
+ def add_news_entry(self, key: str, news_entry: dict):
+ is_ai_summary = 1 if news_entry.get("is_ai_summary", False) else 0
+ en_headline = news_entry.get("en_headline", None)
+ en_content = news_entry.get("en_content", None)
+ headline = news_entry.get("headline", None)
+ url = news_entry.get("url", None)
+ self._cursor.execute(
+ "INSERT OR REPLACE INTO news (news_id, date, identifier, type, timestamp, headline, content, url, is_ai_summary, en_headline, en_content) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
+ (
+ key,
+ news_entry["date"],
+ news_entry["identifier"],
+ news_entry["type"],
+ news_entry["timestamp"],
+ headline,
+ news_entry["content"],
+ url,
+ is_ai_summary,
+ en_headline,
+ en_content,
+ ),
+ )
+ for image_entry in news_entry["images"]:
+ self._cursor.execute(
+ "INSERT OR REPLACE INTO news_images (news_id, image_url, link_url) VALUES (?, ?, ?)",
+ (key, image_entry["image"], image_entry["link"]),
+ )
+ self._conn.commit()
+
+
def get_summary(self, key: str):
self._cursor.execute(
- "SELECT headline, content FROM summarization WHERE id = ?",
- (key,)
+ "SELECT headline, content FROM summarization WHERE id = ?", (key,)
)
result = self._cursor.fetchone()
if result is None:
@@ -83,20 +134,14 @@ class Database:
return {"headline": result[0], "content": result[1]}
def get_translation(self, key: str):
- self._cursor.execute(
- "SELECT result FROM translation WHERE id = ?",
- (key,)
- )
+ self._cursor.execute("SELECT result FROM translation WHERE id = ?", (key,))
result = self._cursor.fetchone()
if result is None:
return None
return result[0]
def get_wac_entry(self, key: str):
- self._cursor.execute(
- "SELECT isNews, type FROM wacplus WHERE id = ?",
- (key,)
- )
+ self._cursor.execute("SELECT isNews, type FROM wacplus WHERE id = ?", (key,))
result = self._cursor.fetchone()
if result is None:
return None
diff --git a/generate.py b/generate.py
index fac8c57..3611100 100644
--- a/generate.py
+++ b/generate.py
@@ -10,15 +10,25 @@ import hashlib
import os
from dotenv import load_dotenv
from datetime import datetime, timedelta
+from database import Database
load_dotenv()
OUTPUT_DIR = "news"
+ARCHIVE_NEWS = True
def compute_json_hash(data):
return hashlib.sha256(json.dumps(data, sort_keys=True).encode('utf-8')).hexdigest()
+def save_news_to_db(news_feed: list):
+ log_output("Writing news to local save database. This is purely for archival reasons")
+ database = Database()
+ for entry in news_feed:
+ key = compute_json_hash(entry)
+ database.add_news_entry(key, entry)
+ database.close()
+
def create_merged_feed(*news_lists, limit=constants.DAYS_LIMIT):
"""
Generator-based memory-efficient merging of multiple news feeds.
@@ -56,7 +66,13 @@ def log_output(message: str, type: str="DEBUG"):
def generate_news_file(filename, url, version=None):
log_output(f"Fetching {filename.upper()} News Data", "NEWS")
- news_data = feed.get_news(url, version) if version else feed.get_news(url)
+ news_data = None
+ try:
+ news_data = feed.get_news(url, version) if version else feed.get_news(url)
+ except Exception as e:
+ print(e)
+ print("[ERROR] Wasn't able to fetch news. Skipping...")
+
path = f"{OUTPUT_DIR}/{filename}.json"
if news_data:
log_output(f"Success. Got {filename.upper()} News Data. Saving to file...", "NEWS")
@@ -207,6 +223,8 @@ if __name__ == "__main__":
wmmt_news
)
log_output("Creating merged news.json file for all news that are within " + str(constants.DAYS_LIMIT) + " days old")
+ if ARCHIVE_NEWS:
+ save_news_to_db(news)
with open(OUTPUT_DIR+'/news.json', 'w') as json_file:
json.dump(attach_news_meta_data(news), json_file)
log_output("JOB DONE", "TASK")
diff --git a/middleware/package.json b/middleware/package.json
index 288c55b..a4446e3 100644
--- a/middleware/package.json
+++ b/middleware/package.json
@@ -1,27 +1,26 @@
{
- "name": "og-cron",
- "version": "0.1.0",
- "private": true,
- "scripts": {
- "dev": "next dev",
- "build": "next build",
- "start": "next start",
- "lint": "next lint"
- },
- "dependencies": {
- "@next/font": "14.2.15",
- "@types/node": "24.0.8",
- "@types/react": "19.1.8",
- "@types/react-dom": "19.1.6",
- "@upstash/redis": "^1.35.0",
- "@vercel/analytics": "^1.5.0",
- "@vercel/og": "^0.6.8",
- "eslint": "9.30.0",
- "eslint-config-next": "15.3.4",
- "global": "^4.4.0",
- "next": "^15.3.4",
- "react": "19.1.0",
- "react-dom": "19.1.0",
- "typescript": "5.8.3"
- }
+ "name": "og-cron",
+ "version": "0.1.0",
+ "private": true,
+ "scripts": {
+ "dev": "next dev",
+ "build": "next build",
+ "start": "next start",
+ "lint": "next lint"
+ },
+ "dependencies": {
+ "@next/font": "14.2.15",
+ "@types/node": "24.0.8",
+ "@types/react": "19.1.8",
+ "@types/react-dom": "19.1.6",
+ "@vercel/analytics": "^1.5.0",
+ "@vercel/og": "^0.6.8",
+ "eslint": "9.30.0",
+ "eslint-config-next": "15.3.4",
+ "global": "^4.4.0",
+ "next": "^15.3.4",
+ "react": "19.1.0",
+ "react-dom": "19.1.0",
+ "typescript": "5.8.3"
+ }
}
diff --git a/middleware/pnpm-lock.yaml b/middleware/pnpm-lock.yaml
index 83f2e87..99b252e 100644
--- a/middleware/pnpm-lock.yaml
+++ b/middleware/pnpm-lock.yaml
@@ -20,9 +20,6 @@ importers:
'@types/react-dom':
specifier: 19.1.6
version: 19.1.6(@types/react@19.1.8)
- '@upstash/redis':
- specifier: ^1.35.0
- version: 1.35.0
'@vercel/analytics':
specifier: ^1.5.0
version: 1.5.0(next@15.3.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0))(react@19.1.0)
@@ -516,9 +513,6 @@ packages:
cpu: [x64]
os: [win32]
- '@upstash/redis@1.35.0':
- resolution: {integrity: sha512-WUm0Jz1xN4DBDGeJIi2Y0kVsolWRB2tsVds4SExaiLg4wBdHFMB+8IfZtBWr+BP0FvhuBr5G1/VLrJ9xzIWHsg==}
-
'@vercel/analytics@1.5.0':
resolution: {integrity: sha512-MYsBzfPki4gthY5HnYN7jgInhAZ7Ac1cYDoRWFomwGHWEX7odTEzbtg9kf/QSo7XEsEAqlQugA6gJ2WS2DEa3g==}
peerDependencies:
@@ -1665,9 +1659,6 @@ packages:
resolution: {integrity: sha512-nWJ91DjeOkej/TA8pXQ3myruKpKEYgqvpw9lz4OPHj/NWFNluYrjbz9j01CJ8yKQd2g4jFoOkINCTW2I5LEEyw==}
engines: {node: '>= 0.4'}
- uncrypto@0.1.3:
- resolution: {integrity: sha512-Ql87qFHB3s/De2ClA9e0gsnS6zXG27SkTiSJwjCc9MebbfapQfuPzumMIUMi38ezPZVNFcHI9sUIepeQfw8J8Q==}
-
undici-types@7.8.0:
resolution: {integrity: sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==}
@@ -2118,10 +2109,6 @@ snapshots:
'@unrs/resolver-binding-win32-x64-msvc@1.9.2':
optional: true
- '@upstash/redis@1.35.0':
- dependencies:
- uncrypto: 0.1.3
-
'@vercel/analytics@1.5.0(next@15.3.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0))(react@19.1.0)':
optionalDependencies:
next: 15.3.4(react-dom@19.1.0(react@19.1.0))(react@19.1.0)
@@ -3525,8 +3512,6 @@ snapshots:
has-symbols: 1.1.0
which-boxed-primitive: 1.1.1
- uncrypto@0.1.3: {}
-
undici-types@7.8.0: {}
unicode-trie@2.0.0:
diff --git a/schema.sql b/schema.sql
index 5c2e2f0..88e1120 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,5 +1,5 @@
CREATE TABLE IF NOT EXISTS news (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
+ news_id VARCHAR(255) NOT NULL PRIMARY KEY,
date TEXT NOT NULL,
identifier TEXT NOT NULL,
type TEXT,
@@ -13,11 +13,10 @@ CREATE TABLE IF NOT EXISTS news (
);
CREATE TABLE IF NOT EXISTS news_images (
- id INTEGER PRIMARY KEY AUTOINCREMENT,
- news_id INTEGER NOT NULL,
+ news_id VARCHAR(255) NOT NULL PRIMARY KEY,
image_url TEXT NOT NULL,
link_url TEXT,
- FOREIGN KEY (news_id) REFERENCES news(id) ON DELETE CASCADE
+ FOREIGN KEY (news_id) REFERENCES news(news_id) ON DELETE CASCADE
);
CREATE TABLE IF NOT EXISTS summarization (
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage