diff options
| author | Pinapelz <yukais@pinapelz.com> | 2026-05-27 16:20:27 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2026-05-27 16:20:27 -0700 |
| commit | f633e8b0d7c42079efb7f3885b96ab20f674302a (patch) | |
| tree | c9b4f24b91989f9eaafd63947cdaef9a546d7b33 /captcha_scraper/scrape_data.py | |
| parent | 478b257fa4b5f09730f87e6bf35555a1062e26ac (diff) | |
improve kpop captcha scraper to pull from member pages
Diffstat (limited to 'captcha_scraper/scrape_data.py')
| -rw-r--r-- | captcha_scraper/scrape_data.py | 109 |
1 files changed, 72 insertions, 37 deletions
diff --git a/captcha_scraper/scrape_data.py b/captcha_scraper/scrape_data.py index dcfaa86..1e73d55 100644 --- a/captcha_scraper/scrape_data.py +++ b/captcha_scraper/scrape_data.py @@ -1,14 +1,15 @@ -import os from pathlib import Path +import os +from bs4 import BeautifulSoup from urllib.parse import urlparse - +import re import requests -from groups import PULL_GROUPS +from groups import PULL_GROUPS, EXCLUSIONS KPOPPING_BASE = "https://kpopping.com" IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"} - +OUTPUT_DIR = Path("../captcha-original") def fetch_female_idols() -> dict: has_more = True @@ -31,48 +32,82 @@ def fetch_female_idols() -> dict: idx += 1 return result +def extract_idol_id(idol_name_id: str): + response = requests.get(f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}") + soup = BeautifulSoup(response.text, 'html.parser') + a_tags = soup.find_all("a") + for tag in a_tags: + href = tag.get("href") + idol_id_regex =re.search("^\/kpics\?idol=([^&]+)&idolName=.+$", href) + if idol_id_regex is None: + continue + return idol_id_regex.group(1) + return None def download_images(group_data): - base_dir = Path(__file__).resolve().parent - total = 0 - for group_name in PULL_GROUPS: - members = group_data.get(group_name) - if not members: - print(f"Group '{group_name}' not found in fetched data, skipping.") + for group in PULL_GROUPS: + dest = OUTPUT_DIR / group + dest.mkdir(parents=True,exist_ok=True) + if group not in PULL_GROUPS: + print(f"{group} not in {PULL_GROUPS}") continue - group_dir = base_dir / group_name - group_dir.mkdir(parents=True, exist_ok=True) - existing_indices = [] - for p in group_dir.iterdir(): - if p.is_file() and p.stem.isdigit(): - existing_indices.append(int(p.stem)) - next_index = max(existing_indices) + 1 if existing_indices else 0 - - for member in members: - image_url = member.get("image") - if not image_url: + exclude_list = [] + if group in EXCLUSIONS: + exclude_list = EXCLUSIONS[group] 
+ for member in group_data[group]: + if member["id"] in exclude_list: continue - parsed = urlparse(image_url) - ext = os.path.splitext(parsed.path)[1].lower() or ".webp" - dest = group_dir / f"{next_index}{ext}" - while dest.exists(): - next_index += 1 - dest = group_dir / f"{next_index}{ext}" - try: - resp = requests.get(image_url, timeout=20) + print(f"Downloading images for {member["name"]}") + member_id = member["id"] + idol_id = extract_idol_id(member_id) + photo_api_url = f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos" + response = requests.get(photo_api_url) + if not response: + print(f"Failed to get photos for {member}") + photo_data = dict(response.json()) + progress = 1 + photos_found = len(photo_data["photos"]) + for photo in photo_data["photos"]: + print(f"Now downloading {photo["slug"]} ({progress}/{photos_found})") + url = photo["src"] + filename = Path(urlparse(url).path).name + out_path = dest / filename + resp = requests.get(url, stream=True, timeout=15) resp.raise_for_status() - with open(dest, "wb") as f: - f.write(resp.content) - print(f"Saved {dest}") - total += 1 - except Exception as e: - print(f"Failed to download {image_url}: {e}") - next_index += 1 - print(f"Downloaded {total} images.") + with open(out_path, "wb") as f: + for chunk in resp.iter_content(8192): + if chunk: + f.write(chunk) + progress += 1 + + +def renumber_images_recursive(root_folder: str | Path) -> None: + root = Path(root_folder) + for current_dir, _, _ in os.walk(root): + directory = Path(current_dir) + images = sorted( + [p for p in directory.iterdir() if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS], + key=lambda p: p.name.lower(), + ) + if not images: + continue + + temp_files = [] + for i, image in enumerate(images): + temp = directory / f"__tmp_{i}{image.suffix.lower()}" + image.rename(temp) + temp_files.append(temp) + + for i, temp in enumerate(temp_files): + temp.rename(directory / f"{i}{temp.suffix.lower()}") + + print(f"Renumbered 
{len(images)} images in {directory}") def main(): group_data = fetch_female_idols() download_images(group_data) + renumber_images_recursive("../captcha-original") + if __name__ == "__main__": |
