"""Scrape idol photos from kpopping.com and renumber them per group.

Reconstructed from the removal diff of ``captcha_scraper/scrape_data.py``
(commit subject: "convert captcha scraper to a submodule").

Pipeline, as driven by :func:`main`:

1. :func:`fetch_female_idols` pages through the site's index API and groups
   the returned entries by their ``group`` field.
2. :func:`download_images` downloads every photo of every non-excluded
   member of the groups listed in ``PULL_GROUPS`` into ``OUTPUT_DIR/<group>``.
3. :func:`renumber_images_recursive` renames the images in each directory to
   ``1.<ext>``, ``2.<ext>``, ...
"""
import os
import re
import uuid
from pathlib import Path
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from groups import PULL_GROUPS, EXCLUSIONS

KPOPPING_BASE = "https://kpopping.com"
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}
OUTPUT_DIR = Path("../captcha-original")

# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 15

# Matches profile-page links of the form "/kpics?idol=<id>&idolName=<name>";
# group 1 is the idol id expected by the photo API.
_IDOL_ID_PATTERN = re.compile(r"^/kpics\?idol=([^&]+)&idolName=.+$")


def fetch_female_idols() -> dict:
    """Return all female idol entries from the index API, keyed by group.

    The API is paged through its ``range`` parameter (starting at 1) until the
    response's ``hasMore`` flag is falsy or absent. Entries without a
    ``group`` key are skipped.

    Returns:
        A dict mapping each group value to the list of member entries
        (the raw dicts returned by the API) belonging to it.

    Raises:
        requests.RequestException: if a page cannot be fetched.
    """
    result: dict = {}
    idx = 1
    has_more = True
    while has_more:
        response = requests.get(
            f"{KPOPPING_BASE}/api/index-entries?type=idols&range={idx}&gender=female",
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        api_response = response.json()
        # "data" is a list of buckets, each carrying its own "entries" list.
        for bucket in api_response["data"]:
            for member in bucket["entries"]:
                if "group" not in member:
                    continue
                result.setdefault(member["group"], []).append(member)
        has_more = bool(api_response.get("hasMore"))
        idx += 1
    return result


def extract_idol_id(idol_name_id: str):
    """Look up the photo-API idol id on an idol's profile page.

    Args:
        idol_name_id: The identifier used in the profile URL
            (``/profiles/idol/<idol_name_id>``).

    Returns:
        The ``idol`` query value of the first ``/kpics?idol=...&idolName=...``
        link on the page, or ``None`` if the page could not be fetched or
        contains no such link.
    """
    response = requests.get(
        f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}",
        timeout=REQUEST_TIMEOUT,
    )
    if not response.ok:
        return None
    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors without an href, which would otherwise hand
    # None to the regex and raise TypeError.
    for tag in soup.find_all("a", href=True):
        match = _IDOL_ID_PATTERN.search(tag["href"])
        if match is not None:
            return match.group(1)
    return None


def _download_file(url: str, out_path: Path) -> None:
    """Stream ``url`` into ``out_path``, removing the file if the transfer breaks."""
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()
        try:
            with open(out_path, "wb") as f:
                for chunk in resp.iter_content(8192):
                    if chunk:
                        f.write(chunk)
        except requests.RequestException:
            # Do not leave a truncated image behind for the renumbering step.
            out_path.unlink(missing_ok=True)
            raise


def download_images(group_data):
    """Download all photos for the members of every group in ``PULL_GROUPS``.

    Images are written to ``OUTPUT_DIR/<group>/<original file name>``.
    Members whose ``id`` is listed in ``EXCLUSIONS[group]`` are skipped.
    Failures for a single group, member, or photo are reported on stdout and
    skipped so that one bad item does not abort the whole run.

    Args:
        group_data: Mapping of group to member entries, as returned by
            :func:`fetch_female_idols`. Each member entry must provide
            ``"id"`` and ``"name"``.
    """
    for group in PULL_GROUPS:
        if group not in group_data:
            print(f"{group} not found in fetched idol data, skipping")
            continue

        dest = OUTPUT_DIR / group
        dest.mkdir(parents=True, exist_ok=True)
        exclude_list = EXCLUSIONS[group] if group in EXCLUSIONS else []

        for member in group_data[group]:
            member_id = member["id"]
            if member_id in exclude_list:
                continue
            print(f"Downloading images for {member['name']}")

            idol_id = extract_idol_id(member_id)
            if idol_id is None:
                print(f"Could not resolve idol id for {member_id}")
                continue

            photo_api_url = f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos"
            response = requests.get(photo_api_url, timeout=REQUEST_TIMEOUT)
            if not response.ok:
                print(f"Failed to get photos for {member}")
                continue

            photos = response.json().get("photos", [])
            photos_found = len(photos)
            for progress, photo in enumerate(photos, start=1):
                print(f"Now downloading {photo['slug']} ({progress}/{photos_found})")
                url = photo["src"]
                filename = Path(urlparse(url).path).name
                if not filename:
                    print(f"Skipping {url}: no file name in URL")
                    continue
                try:
                    _download_file(url, dest / filename)
                except requests.RequestException as err:
                    print(f"Failed to download {url}: {err}")


def renumber_images_recursive(root_folder: str | Path) -> None:
    """Rename the images in every directory under ``root_folder`` to 1..N.

    Within each directory, files whose lower-cased suffix is in
    ``IMAGE_EXTENSIONS`` are sorted by lower-cased name and renamed to
    ``<position><lower-cased suffix>``. Other files are left untouched.

    Args:
        root_folder: Directory tree to process (the root itself included).
    """
    root = Path(root_folder)
    for current_dir, _, _ in os.walk(root):
        directory = Path(current_dir)
        images = sorted(
            (
                p
                for p in directory.iterdir()
                if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS
            ),
            key=lambda p: p.name.lower(),
        )
        if not images:
            continue

        # Two-phase rename: move everything to temporary names first so that a
        # target such as "2.jpg" never overwrites a not-yet-renamed source.
        # The random token keeps these names from colliding with temp files
        # left over from an interrupted earlier run.
        token = uuid.uuid4().hex
        temp_files = []
        for index, image in enumerate(images, start=1):
            temp = directory / f"__tmp_{token}_{index}{image.suffix.lower()}"
            image.rename(temp)
            temp_files.append(temp)

        for index, temp in enumerate(temp_files, start=1):
            temp.rename(directory / f"{index}{temp.suffix}")

        print(f"Renumbered {len(images)} images in {directory}")


def main():
    """Fetch the idol index, download the configured groups, then renumber."""
    group_data = fetch_female_idols()
    download_images(group_data)
    renumber_images_recursive(OUTPUT_DIR)


if __name__ == "__main__":
    main()