add recovery script for filling backlog of missing datapointsHEAD master

author: Pinapelz <yukais@pinapelz.com> 2025-08-21 15:14:07 -0700
committer: Pinapelz <yukais@pinapelz.com> 2025-08-21 15:14:26 -0700
commit: faaecf3b755f1e64be7baee214641ef540e7e57c (patch)
tree: dcec1035ceb0c970efb3899cc999776bce672d33 /recovery
parent: e14a40cec09100feceb49487508678a8b0e05d9d (diff)
2 files changed, 312 insertions, 0 deletions
diff --git a/recovery/README.txt b/recovery/README.txt
new file mode 100644
index 0000000..fa94451
--- /dev/null
+++ b/recovery/README.txt
@@ -0,0 +1,7 @@
+This tool is for filling in previously missing data, generally it should not be used unless you are missing specific data points.
+
+The necessary deps already exist in `requirements.txt` in the root directory
+
+Run this from the root directory.
+
+python recovery/recovery.py
diff --git a/recovery/recover.py b/recovery/recover.py
new file mode 100644
index 0000000..fb40281
--- /dev/null
+++ b/recovery/recover.py
@@ -0,0 +1,305 @@
+from dotenv import load_dotenv
+import json
+import requests
+import sys
+import os
+import time
+import argparse
+from datetime import datetime, timedelta
+import re
+from bs4 import BeautifulSoup
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from sql.pg_handler import PostgresHandler
+load_dotenv()
+
+
+def download_site_as_html(url: str, timeout: int = 10, response_encoding=None) -> str:
+    headers = {
+        "User-Agent": (
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/122.0.0.0 Safari/537.36"
+        ),
+        "Accept": (
+            "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8"
+        ),
+        "Accept-Language": "en-US,en;q=0.9",
+        "Connection": "keep-alive",
+    }
+
+    try:
+        response = requests.get(url, headers=headers, timeout=timeout)
+        if response_encoding:
+            response.encoding = response_encoding
+        response.raise_for_status()
+        return response.text
+    except requests.RequestException as e:
+        print(f"Error downloading {url}: {e}")
+        return ""
+
+def create_database_connection():
+    """
+    Creates a database connection using the environment variables
+    :param: auth_append: str = "" - If you want to use a different set of variables for persisitance of sessions
+    """
+    hostname = os.environ.get("POSTGRES_HOST")
+    user = os.environ.get("POSTGRES_USER")
+    password = os.environ.get("POSTGRES_PASSWORD")
+    database = os.environ.get("POSTGRES_DATABASE")
+    port = os.environ.get("POSTGRES_PORT") if os.environ.get("POSTGRES_PORT") else 6543
+    return PostgresHandler(host_name=hostname, username=user, password=password, database=database, port=port)
+
+
+def get_list_of_channels(database: PostgresHandler):
+    """
+    Get list of channels from the live table
+    Returns: List of tuples containing (name, channel_id)
+    """
+    import json
+    with open(os.path.join(os.path.dirname(__file__), '..', 'sql_table_config.json'), 'r') as f:
+        config = json.load(f)
+    table_name = config['TABLE_LIVE']
+    query = f"SELECT name, channel_id FROM {table_name}"
+    try:
+        result = database.execute_query(query)
+        return [(row[0], row[1]) for row in result] if result else []
+    except Exception as e:
+        print(f"Error getting channels from database: {e}")
+        return []
+
+
+def check_missing_dates_for_channel(database: PostgresHandler, channel_id: str, from_date: str, to_date: str):
+    """
+    Check which dates are missing for a specific channel in the historical table
+    Args:
+        database: PostgresHandler instance
+        channel_id: Channel ID to check
+        from_date: Start date in YYYY-MM-DD format
+        to_date: End date in YYYY-MM-DD format
+    Returns: List of missing dates in YYYY-MM-DD format
+    """
+    with open(os.path.join(os.path.dirname(__file__), '..', 'sql_table_config.json'), 'r') as f:
+        config = json.load(f)
+
+    table_name = config['TABLE_HISTORICAL']
+    start_date = datetime.strptime(from_date, '%Y-%m-%d')
+    end_date = datetime.strptime(to_date, '%Y-%m-%d')
+
+    all_dates = []
+    current_date = start_date
+    while current_date <= end_date:
+        all_dates.append(current_date.strftime('%Y-%m-%d'))
+        current_date += timedelta(days=1)
+    query = f"""
+    SELECT DISTINCT DATE(timestamp) as date_only
+    FROM {table_name}
+    WHERE channel_id = %s
+    AND DATE(timestamp) BETWEEN %s AND %s
+    ORDER BY date_only
+    """
+
+    try:
+        result = database.execute_query(query, (channel_id, from_date, to_date))
+        existing_dates = [row[0].strftime('%Y-%m-%d') for row in result] if result else []
+        missing_dates = [date for date in all_dates if date not in existing_dates]
+        return missing_dates
+    except Exception as e:
+        print(f"Error checking missing dates for channel {channel_id}: {e}")
+        return all_dates
+
+
+def convert_subscriber_count(sub_text: str) -> int:
+    """
+    Convert subscriber count text like '92.4K' to integer
+    Args:
+        sub_text: Subscriber count text (e.g., '92.4K', '1.2M', '5000')
+    Returns: Integer subscriber count
+    """
+    if not sub_text:
+        return 0
+    clean_text = re.sub(r'[^\d.KMBkmb]', '', sub_text.upper())
+    if 'K' in clean_text:
+        number = float(clean_text.replace('K', ''))
+        return int(number * 1000)
+    elif 'M' in clean_text:
+        number = float(clean_text.replace('M', ''))
+        return int(number * 1000000)
+    elif 'B' in clean_text:
+        number = float(clean_text.replace('B', ''))
+        return int(number * 1000000000)
+    else:
+        try:
+            return int(float(clean_text))
+        except ValueError:
+            return 0
+
+
+def scrape_socialblade_data_for_date(channel_id: str, target_date: str) -> dict:
+    """
+    Scrape subscriber count for a specific date from SocialBlade historical data
+    Args:
+        channel_id: YouTube channel ID
+        target_date: Date in YYYY-MM-DD format
+    Returns: Dictionary with scraped data for that date
+    """
+    url = f"https://socialblade.com/youtube/channel/{channel_id}"
+    html_content = download_site_as_html(url)
+
+    if not html_content:
+        return {}
+
+    try:
+        soup = BeautifulSoup(html_content, 'html.parser')
+        tables = soup.find_all('table')
+        subscriber_count = 0
+
+        for table in tables:
+            rows = table.find_all('tr')
+            for row in rows:
+                cells = row.find_all('td')
+                if len(cells) >= 3:
+                    date_cell = cells[0].get_text().strip()
+                    date_match = re.search(r'(\d{4}-\d{2}-\d{2})', date_cell)
+                    if date_match and date_match.group(1) == target_date:
+                        if len(cells) >= 3:
+                            sub_text = cells[2].get_text().strip()
+                            subscriber_count = convert_subscriber_count(sub_text)
+                            break
+
+            if subscriber_count > 0:
+                break
+        if subscriber_count == 0:
+            stats_elements = soup.find_all('p', class_='text-[1.25em]')
+            for element in stats_elements:
+                parent = element.parent
+                if parent and 'subscribers' in parent.get_text().lower():
+                    subscriber_count = convert_subscriber_count(element.get_text().strip())
+                    break
+
+        return {
+            'subscriber_count': subscriber_count,
+            'channel_id': channel_id,
+            'date': target_date
+        }
+    except Exception as e:
+        print(f"Error parsing SocialBlade data for {channel_id} on {target_date}: {e}")
+        return {}
+
+
+def get_profile_pic_from_live(database: PostgresHandler, channel_id: str) -> str:
+    """
+    Get profile pic URL from live table for a given channel
+    Args:
+        database: PostgresHandler instance
+        channel_id: YouTube channel ID
+    Returns: Profile pic URL or placeholder if not found
+    """
+    with open(os.path.join(os.path.dirname(__file__), '..', 'sql_table_config.json'), 'r') as f:
+        config = json.load(f)
+
+    table_name = config['TABLE_LIVE']
+    query = f"SELECT profile_pic FROM {table_name} WHERE channel_id = %s"
+
+    try:
+        result = database.execute_query(query, (channel_id,))
+        if result and len(result) > 0:
+            return result[0][0]
+        else:
+            return f"https://yt3.ggpht.com/placeholder"
+    except Exception as e:
+        print(f"Error getting profile pic for {channel_id}: {e}")
+        return f"https://yt3.ggpht.com/placeholder"
+
+
+def insert_historical_data(database: PostgresHandler, channel_id: str, name: str, subscriber_count: int, date: str):
+    """
+    Insert scraped data into historical table
+    Args:
+        database: PostgresHandler instance
+        channel_id: YouTube channel ID
+        name: Channel name
+        subscriber_count: Subscriber count
+        date: Date string in YYYY-MM-DD format
+    """
+    with open(os.path.join(os.path.dirname(__file__), '..', 'sql_table_config.json'), 'r') as f:
+        config = json.load(f)
+
+    table_name = config['TABLE_HISTORICAL']
+    timestamp = f"{date} 00:00:00"  # Set to midnight
+    profile_pic = get_profile_pic_from_live(database, channel_id)
+
+    query = f"""
+    INSERT INTO {table_name} (channel_id, profile_pic, name, subscriber_count, timestamp)
+    VALUES (%s, %s, %s, %s, %s)
+    """
+
+    try:
+        database.execute_query(query, (channel_id, profile_pic, name, subscriber_count, timestamp))
+        print(f"  Inserted data for {date}: {subscriber_count} subscribers")
+    except Exception as e:
+        print(f"  Error inserting data for {date}: {e}")
+
+
+def stub_insert_historical_data(database: PostgresHandler, channel_id: str, name: str, subscriber_count: int, date: str):
+    """
+    Stub function to print what would be inserted into historical table
+    """
+    timestamp = f"{date} 00:00:00"  # Set to midnight
+    profile_pic = get_profile_pic_from_live(database, channel_id)
+
+    print(f"  STUB: Would insert into historical table:")
+    print(f"    Channel ID: {channel_id}")
+    print(f"    Name: {name}")
+    print(f"    Subscriber Count: {subscriber_count}")
+    print(f"    Timestamp: {timestamp}")
+    print(f"    Profile Pic: {profile_pic}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Recovery script for NijiTrack API')
+    parser.add_argument('--from', dest='from_date', required=True,
+                       help='Start date in YYYY-MM-DD format')
+    parser.add_argument('--to', dest='to_date', required=True,
+                       help='End date in YYYY-MM-DD format')
+    parser.add_argument('--dry-run', action='store_true',
+                       help='Only show what would be inserted, do not write to database')
+
+    args = parser.parse_args()
+    try:
+        datetime.strptime(args.from_date, '%Y-%m-%d')
+        datetime.strptime(args.to_date, '%Y-%m-%d')
+    except ValueError:
+        print("Error: Dates must be in YYYY-MM-DD format")
+        sys.exit(1)
+    if datetime.strptime(args.from_date, '%Y-%m-%d') > datetime.strptime(args.to_date, '%Y-%m-%d'):
+        print("Error: From date cannot be later than to date")
+        sys.exit(1)
+
+    database = create_database_connection()
+    channels = get_list_of_channels(database)
+
+    print(f"Checking for missing dates from {args.from_date} to {args.to_date}")
+    print(f"Found {len(channels)} channels to process")
+
+    for name, channel_id in channels:
+        print(f"\nProcessing channel: {name} ({channel_id})")
+        missing_dates = check_missing_dates_for_channel(database, channel_id, args.from_date, args.to_date)
+
+        if missing_dates:
+            print(f"  Found {len(missing_dates)} missing dates")
+            for date in missing_dates:
+                print(f"    Processing date: {date}")
+                socialblade_data = scrape_socialblade_data_for_date(channel_id, date)
+
+                if socialblade_data and socialblade_data.get('subscriber_count', 0) > 0:
+                    subscriber_count = socialblade_data['subscriber_count']
+                    print(f"    Scraped subscriber count for {date}: {subscriber_count}")
+                    if args.dry_run:
+                        stub_insert_historical_data(database, channel_id, name, subscriber_count, date)
+                    else:
+                        insert_historical_data(database, channel_id, name, subscriber_count, date)
+                else:
+                    print(f"    Failed to scrape data for {channel_id} on {date}")
+                time.sleep(1.0)
+        else:
+            print(f"  No missing dates found")
author	Pinapelz <yukais@pinapelz.com>	2025-08-21 15:14:07 -0700
committer	Pinapelz <yukais@pinapelz.com>	2025-08-21 15:14:26 -0700
commit	faaecf3b755f1e64be7baee214641ef540e7e57c (patch)
tree	dcec1035ceb0c970efb3899cc999776bce672d33 /recovery
parent	e14a40cec09100feceb49487508678a8b0e05d9d (diff)