diff options
| author | Pinapelz <yukais@pinapelz.com> | 2026-05-27 16:20:27 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2026-05-27 16:20:27 -0700 |
| commit | f633e8b0d7c42079efb7f3885b96ab20f674302a (patch) | |
| tree | c9b4f24b91989f9eaafd63947cdaef9a546d7b33 | |
| parent | 478b257fa4b5f09730f87e6bf35555a1062e26ac (diff) | |
improve kpop captcha scraper to pull from member pages
| -rw-r--r-- | captcha_scraper/groups.py | 11 | ||||
| -rw-r--r-- | captcha_scraper/scrape_data.py | 109 | ||||
| -rw-r--r-- | captcha_scraper/scrape_member.py | 86 |
3 files changed, 82 insertions, 124 deletions
diff --git a/captcha_scraper/groups.py b/captcha_scraper/groups.py index 3fb2e73..e5dfb04 100644 --- a/captcha_scraper/groups.py +++ b/captcha_scraper/groups.py @@ -1,3 +1,12 @@ PULL_GROUPS = [ - "NewJeans", "NMIXX", "IVE", "TWICE", "RED VELVET", "BLACKPINK", "LE SSERAFIM", "aespa", "i-dle", "KATSEYE", "ILLIT", "ITZY" + "NewJeans", "NMIXX", "IVE", "TWICE", "Red Velvet", "BLACKPINK", "LE SSERAFIM", "aespa", "i-dle", "KATSEYE", "ILLIT", "ITZY" ] + +EXCLUSIONS = { + "NMIXX": ["Jini4"], + "LE SSERAFIM": ["Kim-Garam"], + "NewJeans": ["Danielle"], + "i-dle": ["Soojin"], + "ILLIT": ["Youngseo"], + "KATSEYE": ["Manon"] +} diff --git a/captcha_scraper/scrape_data.py b/captcha_scraper/scrape_data.py index dcfaa86..1e73d55 100644 --- a/captcha_scraper/scrape_data.py +++ b/captcha_scraper/scrape_data.py @@ -1,14 +1,15 @@ -import os from pathlib import Path +import os +from bs4 import BeautifulSoup from urllib.parse import urlparse - +import re import requests -from groups import PULL_GROUPS +from groups import PULL_GROUPS, EXCLUSIONS KPOPPING_BASE = "https://kpopping.com" IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"} - +OUTPUT_DIR = Path("../captcha-original") def fetch_female_idols() -> dict: has_more = True @@ -31,48 +32,82 @@ def fetch_female_idols() -> dict: idx += 1 return result +def extract_idol_id(idol_name_id: str): + response = requests.get(f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}") + soup = BeautifulSoup(response.text, 'html.parser') + a_tags = soup.find_all("a") + for tag in a_tags: + href = tag.get("href") + idol_id_regex =re.search("^\/kpics\?idol=([^&]+)&idolName=.+$", href) + if idol_id_regex is None: + continue + return idol_id_regex.group(1) + return None def download_images(group_data): - base_dir = Path(__file__).resolve().parent - total = 0 - for group_name in PULL_GROUPS: - members = group_data.get(group_name) - if not members: - print(f"Group '{group_name}' not found in fetched data, skipping.") + 
for group in PULL_GROUPS: + dest = OUTPUT_DIR / group + dest.mkdir(parents=True,exist_ok=True) + if group not in PULL_GROUPS: + print(f"{group} not in {PULL_GROUPS}") continue - group_dir = base_dir / group_name - group_dir.mkdir(parents=True, exist_ok=True) - existing_indices = [] - for p in group_dir.iterdir(): - if p.is_file() and p.stem.isdigit(): - existing_indices.append(int(p.stem)) - next_index = max(existing_indices) + 1 if existing_indices else 0 - - for member in members: - image_url = member.get("image") - if not image_url: + exclude_list = [] + if group in EXCLUSIONS: + exclude_list = EXCLUSIONS[group] + for member in group_data[group]: + if member["id"] in exclude_list: continue - parsed = urlparse(image_url) - ext = os.path.splitext(parsed.path)[1].lower() or ".webp" - dest = group_dir / f"{next_index}{ext}" - while dest.exists(): - next_index += 1 - dest = group_dir / f"{next_index}{ext}" - try: - resp = requests.get(image_url, timeout=20) + print(f"Downloading images for {member["name"]}") + member_id = member["id"] + idol_id = extract_idol_id(member_id) + photo_api_url = f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos" + response = requests.get(photo_api_url) + if not response: + print(f"Failed to get photos for {member}") + photo_data = dict(response.json()) + progress = 1 + photos_found = len(photo_data["photos"]) + for photo in photo_data["photos"]: + print(f"Now downloading {photo["slug"]} ({progress}/{photos_found})") + url = photo["src"] + filename = Path(urlparse(url).path).name + out_path = dest / filename + resp = requests.get(url, stream=True, timeout=15) resp.raise_for_status() - with open(dest, "wb") as f: - f.write(resp.content) - print(f"Saved {dest}") - total += 1 - except Exception as e: - print(f"Failed to download {image_url}: {e}") - next_index += 1 - print(f"Downloaded {total} images.") + with open(out_path, "wb") as f: + for chunk in resp.iter_content(8192): + if chunk: + f.write(chunk) + progress += 1 + + +def 
renumber_images_recursive(root_folder: str | Path) -> None: + root = Path(root_folder) + for current_dir, _, _ in os.walk(root): + directory = Path(current_dir) + images = sorted( + [p for p in directory.iterdir() if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS], + key=lambda p: p.name.lower(), + ) + if not images: + continue + + temp_files = [] + for i, image in enumerate(images): + temp = directory / f"__tmp_{i}{image.suffix.lower()}" + image.rename(temp) + temp_files.append(temp) + + for i, temp in enumerate(temp_files): + temp.rename(directory / f"{i}{temp.suffix.lower()}") + + print(f"Renumbered {len(images)} images in {directory}") def main(): group_data = fetch_female_idols() download_images(group_data) + renumber_images_recursive("../captcha-original") + if __name__ == "__main__": diff --git a/captcha_scraper/scrape_member.py b/captcha_scraper/scrape_member.py deleted file mode 100644 index de50237..0000000 --- a/captcha_scraper/scrape_member.py +++ /dev/null @@ -1,86 +0,0 @@ -import os -from pathlib import Path -from urllib.parse import urlparse -import requests - - -IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"} - -def fetch_photos(url: str) -> list: - response = requests.get(url, timeout=30) - response.raise_for_status() - data = response.json() - return data.get("photos", []) - - -def download_photos_from_url(url: str, folder_name: str) -> int: - target_dir = Path(__file__).resolve().parents[1] / "captcha-original" / folder_name - target_dir.mkdir(parents=True, exist_ok=True) - downloaded = 0 - for photo in fetch_photos(url): - src = photo.get("src") - if not src: - continue - - filename = Path(urlparse(src).path).name - if not filename: - continue - - destination = target_dir / filename - if destination.exists(): - continue - - try: - image_response = requests.get(src, timeout=30) - image_response.raise_for_status() - destination.write_bytes(image_response.content) - print(f"Saved {destination}") 
- downloaded += 1 - except Exception as e: - print(f"Failed to download {src}: {e}") - - print(f"Downloaded {downloaded} files into {target_dir}") - return downloaded - - -def renumber_images_recursive(root_folder: str | Path) -> None: - root = Path(root_folder) - for current_dir, _, _ in os.walk(root): - directory = Path(current_dir) - images = sorted( - [p for p in directory.iterdir() if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS], - key=lambda p: p.name.lower(), - ) - if not images: - continue - - temp_files = [] - for i, image in enumerate(images): - temp = directory / f"__tmp_{i}{image.suffix.lower()}" - image.rename(temp) - temp_files.append(temp) - - for i, temp in enumerate(temp_files): - temp.rename(directory / f"{i}{temp.suffix.lower()}") - - print(f"Renumbered {len(images)} images in {directory}") - - -def main(): - pass - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=a1664634-5caf-45d3-a57f-49d99d929aa9&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=4b6d6573-576e-45d6-8c50-73e885d36a3e&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=ed6dc832-9b12-4be2-88f2-df7167b241d6&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=cef6a133-21e5-4034-952f-1edc284a8620&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=425a818e-b8c3-4f80-9f6c-9151dd490bab&section=photos", "NMIXX") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=a9d03774-55a1-4ae3-ba4d-3852f674c2e0&section=photos", "NMIXX") - # renumber_images_recursive("../captcha-original/NMIXX") - # aespa - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=689e39d5-49f1-4454-9f22-82c2911c855d&section=photos", "aespa") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=324e43db-9254-4e7e-adf5-48f5318850e2&section=photos", 
"aespa") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=5307515b-2488-4043-bf00-06887c5b62aa&section=photos", "aespa") - # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=077c4f02-7ca6-49a6-9daf-df1dabc55d0f&section=photos", "aespa") - # renumber_images_recursive("../captcha-original/aespa") - -if __name__ == "__main__": - main() |
