From f633e8b0d7c42079efb7f3885b96ab20f674302a Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Wed, 27 May 2026 16:20:27 -0700 Subject: improve kpop captcha scraper to pull from member pages --- captcha_scraper/scrape_member.py | 86 ---------------------------------------- 1 file changed, 86 deletions(-) delete mode 100644 captcha_scraper/scrape_member.py (limited to 'captcha_scraper/scrape_member.py') diff --git a/captcha_scraper/scrape_member.py b/captcha_scraper/scrape_member.py deleted file mode 100644 index de50237..0000000 --- a/captcha_scraper/scrape_member.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -from pathlib import Path -from urllib.parse import urlparse -import requests - - -IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"} - -def fetch_photos(url: str) -> list: - response = requests.get(url, timeout=30) - response.raise_for_status() - data = response.json() - return data.get("photos", []) - - -def download_photos_from_url(url: str, folder_name: str) -> int: - target_dir = Path(__file__).resolve().parents[1] / "captcha-original" / folder_name - target_dir.mkdir(parents=True, exist_ok=True) - downloaded = 0 - for photo in fetch_photos(url): - src = photo.get("src") - if not src: - continue - - filename = Path(urlparse(src).path).name - if not filename: - continue - - destination = target_dir / filename - if destination.exists(): - continue - - try: - image_response = requests.get(src, timeout=30) - image_response.raise_for_status() - destination.write_bytes(image_response.content) - print(f"Saved {destination}") - downloaded += 1 - except Exception as e: - print(f"Failed to download {src}: {e}") - - print(f"Downloaded {downloaded} files into {target_dir}") - return downloaded - - -def renumber_images_recursive(root_folder: str | Path) -> None: - root = Path(root_folder) - for current_dir, _, _ in os.walk(root): - directory = Path(current_dir) - images = sorted( - [p for p in directory.iterdir() if p.is_file() 
and p.suffix.lower() in IMAGE_EXTENSIONS], - key=lambda p: p.name.lower(), - ) - if not images: - continue - - temp_files = [] - for i, image in enumerate(images): - temp = directory / f"__tmp_{i}{image.suffix.lower()}" - image.rename(temp) - temp_files.append(temp) - - for i, temp in enumerate(temp_files): - temp.rename(directory / f"{i}{temp.suffix.lower()}") - - print(f"Renumbered {len(images)} images in {directory}") - - -def main(): - pass - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=a1664634-5caf-45d3-a57f-49d99d929aa9&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=4b6d6573-576e-45d6-8c50-73e885d36a3e&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=ed6dc832-9b12-4be2-88f2-df7167b241d6&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=cef6a133-21e5-4034-952f-1edc284a8620&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=425a818e-b8c3-4f80-9f6c-9151dd490bab&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=a9d03774-55a1-4ae3-ba4d-3852f674c2e0&section=photos", "NMIXX") - # renumber_images_recursive("../captcha-original/NMIXX") - # aespa - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=689e39d5-49f1-4454-9f22-82c2911c855d&section=photos", "aespa") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=324e43db-9254-4e7e-adf5-48f5318850e2&section=photos", "aespa") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=5307515b-2488-4043-bf00-06887c5b62aa&section=photos", "aespa") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=077c4f02-7ca6-49a6-9daf-df1dabc55d0f&section=photos", "aespa") - # renumber_images_recursive("../captcha-original/aespa") - -if __name__ == "__main__": - main() -- cgit v1.2.3