"""Download photos listed by a JSON endpoint and renumber image folders.

Recovered from commit 478b257fa4b5f09730f87e6bf35555a1062e26ac
("add kpop captcha scraper", Pinapelz, Wed, 27 May 2026 13:53:55 -0700),
which created captcha_scraper/scrape_member.py.
"""

import os
import uuid
from pathlib import Path
from urllib.parse import urlparse

import requests


# Lower-cased file suffixes treated as images by renumber_images_recursive.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".gif",
    ".bmp",
    ".tiff",
    ".tif",
    ".avif",
}


def fetch_photos(url: str) -> list:
    """Return the ``photos`` list from the JSON document served at *url*.

    Args:
        url: Endpoint expected to answer with a JSON object.

    Returns:
        The value stored under ``"photos"``, or an empty list when the key is
        missing or null.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    data = response.json()
    # ``or []`` also covers an explicit JSON null, which ``get(key, [])``
    # would pass through as None and break the caller's loop.
    return data.get("photos") or []


def download_photos_from_url(url: str, folder_name: str) -> int:
    """Download every photo listed at *url* into ``captcha-original/<folder_name>``.

    The target directory is resolved relative to the parent of this file's
    directory and is created if needed. Entries without a ``src`` value, with
    no file name in the URL path, or whose file already exists are skipped.
    A failed download is reported and does not stop the remaining ones.

    Args:
        url: Endpoint passed to ``fetch_photos``.
        folder_name: Sub-folder of ``captcha-original`` to save into.

    Returns:
        Number of files newly written.

    Raises:
        requests.RequestException: If the photo listing itself cannot be fetched.
    """
    target_dir = Path(__file__).resolve().parents[1] / "captcha-original" / folder_name
    target_dir.mkdir(parents=True, exist_ok=True)
    downloaded = 0
    for photo in fetch_photos(url):
        src = photo.get("src")
        if not src:
            continue

        filename = Path(urlparse(src).path).name
        if not filename:
            continue

        destination = target_dir / filename
        if destination.exists():
            continue

        # Write to a sibling ".part" file and move it into place afterwards so
        # an interrupted write never leaves a truncated image that the
        # exists() check above would skip on the next run.
        partial = destination.with_name(destination.name + ".part")
        try:
            image_response = requests.get(src, timeout=30)
            image_response.raise_for_status()
            partial.write_bytes(image_response.content)
            partial.replace(destination)
        except (requests.RequestException, OSError) as e:
            print(f"Failed to download {src}: {e}")
            continue

        print(f"Saved {destination}")
        downloaded += 1

    print(f"Downloaded {downloaded} files into {target_dir}")
    return downloaded


def renumber_images_recursive(root_folder: str | Path) -> None:
    """Rename the images in *root_folder* and every sub-folder to ``0``, ``1``, ...

    Within each directory, files whose lower-cased suffix is in
    ``IMAGE_EXTENSIONS`` are sorted by lower-cased name and renamed to
    ``<index><suffix>`` with the suffix lower-cased. The sort is plain
    lexicographic, so ``10.jpg`` orders before ``2.jpg``.

    Args:
        root_folder: Directory to walk.
    """
    root = Path(root_folder)
    for current_dir, _, _ in os.walk(root):
        directory = Path(current_dir)
        images = sorted(
            (
                p
                for p in directory.iterdir()
                if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS
            ),
            key=lambda p: p.name.lower(),
        )
        if not images:
            continue

        # Two-phase rename: move everything to temporary names first so a
        # final name never collides with a not-yet-renamed source. The random
        # token keeps these names from clashing with "__tmp_*" leftovers of an
        # interrupted earlier run, which rename() would overwrite on POSIX.
        token = uuid.uuid4().hex
        temp_files = []
        for i, image in enumerate(images):
            temp = directory / f"__tmp_{token}_{i}{image.suffix.lower()}"
            image.rename(temp)
            temp_files.append(temp)

        for i, temp in enumerate(temp_files):
            temp.rename(directory / f"{i}{temp.suffix.lower()}")

        print(f"Renumbered {len(images)} images in {directory}")


def main():
    """Entry point; currently a no-op. Uncomment the calls below to run a scrape."""
    pass
    # NMIXX
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=a1664634-5caf-45d3-a57f-49d99d929aa9&section=photos", "NMIXX")
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=4b6d6573-576e-45d6-8c50-73e885d36a3e&section=photos", "NMIXX")
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=ed6dc832-9b12-4be2-88f2-df7167b241d6&section=photos", "NMIXX")
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=cef6a133-21e5-4034-952f-1edc284a8620&section=photos", "NMIXX")
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=425a818e-b8c3-4f80-9f6c-9151dd490bab&section=photos", "NMIXX")
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=a9d03774-55a1-4ae3-ba4d-3852f674c2e0&section=photos", "NMIXX")
    # renumber_images_recursive("../captcha-original/NMIXX")
    # aespa
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=689e39d5-49f1-4454-9f22-82c2911c855d&section=photos", "aespa")
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=324e43db-9254-4e7e-adf5-48f5318850e2&section=photos", "aespa")
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=5307515b-2488-4043-bf00-06887c5b62aa&section=photos", "aespa")
    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=077c4f02-7ca6-49a6-9daf-df1dabc55d0f&section=photos", "aespa")
    # renumber_images_recursive("../captcha-original/aespa")


if __name__ == "__main__":
    main()