improve kpop captcha scraper to pull from member pages

author: Pinapelz <yukais@pinapelz.com> 2026-05-27 16:20:27 -0700
committer: Pinapelz <yukais@pinapelz.com> 2026-05-27 16:20:27 -0700
commit: f633e8b0d7c42079efb7f3885b96ab20f674302a (patch)
tree: c9b4f24b91989f9eaafd63947cdaef9a546d7b33
parent: 478b257fa4b5f09730f87e6bf35555a1062e26ac (diff)
3 files changed, 82 insertions, 124 deletions
diff --git a/captcha_scraper/groups.py b/captcha_scraper/groups.py
index 3fb2e73..e5dfb04 100644
--- a/captcha_scraper/groups.py
+++ b/captcha_scraper/groups.py
@@ -1,3 +1,12 @@
 PULL_GROUPS = [
-    "NewJeans", "NMIXX", "IVE", "TWICE", "RED VELVET", "BLACKPINK", "LE SSERAFIM", "aespa", "i-dle", "KATSEYE", "ILLIT", "ITZY"
+    "NewJeans", "NMIXX", "IVE", "TWICE", "Red Velvet", "BLACKPINK", "LE SSERAFIM", "aespa", "i-dle", "KATSEYE", "ILLIT", "ITZY"
 ]
+
+EXCLUSIONS = {  # group name -> member ids that download_images skips for that group
+    "NMIXX": ["Jini4"],
+    "LE SSERAFIM": ["Kim-Garam"],
+    "NewJeans": ["Danielle"],
+    "i-dle": ["Soojin"],
+    "ILLIT": ["Youngseo"],
+    "KATSEYE": ["Manon"]
+}
diff --git a/captcha_scraper/scrape_data.py b/captcha_scraper/scrape_data.py
index dcfaa86..1e73d55 100644
--- a/captcha_scraper/scrape_data.py
+++ b/captcha_scraper/scrape_data.py
@@ -1,14 +1,15 @@
-import os
 from pathlib import Path
+import os
+from bs4 import BeautifulSoup
 from urllib.parse import urlparse
-
+import re
 import requests
 
-from groups import PULL_GROUPS
+from groups import PULL_GROUPS, EXCLUSIONS
 
 KPOPPING_BASE = "https://kpopping.com"
 IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}
-
+OUTPUT_DIR = Path("../captcha-original")
 
 def fetch_female_idols() -> dict:
     has_more = True
@@ -31,48 +32,82 @@ def fetch_female_idols() -> dict:
         idx += 1
     return result
 
+def extract_idol_id(idol_name_id: str) -> str | None:
+    """Return the idol id from the first /kpics?idol=... link on the idol's profile page, or None if no link matches."""
+    response = requests.get(f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}", timeout=15)
+    soup = BeautifulSoup(response.text, 'html.parser')
+    # href=True skips anchors that have no href; passing None to re.search raises TypeError.
+    for tag in soup.find_all("a", href=True):
+        idol_id_regex = re.search(r"^/kpics\?idol=([^&]+)&idolName=.+$", tag["href"])
+        if idol_id_regex is None:
+            continue
+        return idol_id_regex.group(1)
+    return None
 
 def download_images(group_data):
-    base_dir = Path(__file__).resolve().parent
-    total = 0
-    for group_name in PULL_GROUPS:
-        members = group_data.get(group_name)
-        if not members:
-            print(f"Group '{group_name}' not found in fetched data, skipping.")
+    """Save every photo of each non-excluded member of the PULL_GROUPS groups found in group_data to OUTPUT_DIR/<group>."""
+    for group in PULL_GROUPS:
+        if group not in group_data:  # group_data[group] below would raise KeyError
+            print(f"Group '{group}' not found in fetched data, skipping.")
             continue
-        group_dir = base_dir / group_name
-        group_dir.mkdir(parents=True, exist_ok=True)
-        existing_indices = []
-        for p in group_dir.iterdir():
-            if p.is_file() and p.stem.isdigit():
-                existing_indices.append(int(p.stem))
-        next_index = max(existing_indices) + 1 if existing_indices else 0
-
-        for member in members:
-            image_url = member.get("image")
-            if not image_url:
+        dest = OUTPUT_DIR / group
+        dest.mkdir(parents=True, exist_ok=True)
+        exclude_list = EXCLUSIONS.get(group, [])
+        for member in group_data[group]:
+            if member["id"] in exclude_list:
                 continue
-            parsed = urlparse(image_url)
-            ext = os.path.splitext(parsed.path)[1].lower() or ".webp"
-            dest = group_dir / f"{next_index}{ext}"
-            while dest.exists():
-                next_index += 1
-                dest = group_dir / f"{next_index}{ext}"
-            try:
-                resp = requests.get(image_url, timeout=20)
-                resp.raise_for_status()
-                with open(dest, "wb") as f:
-                    f.write(resp.content)
-                print(f"Saved {dest}")
-                total += 1
-            except Exception as e:
-                print(f"Failed to download {image_url}: {e}")
-            next_index += 1
-    print(f"Downloaded {total} images.")
+            print(f"Downloading images for {member['name']}")
+            idol_id = extract_idol_id(member["id"])
+            if idol_id is None:  # no kpics link on the profile page, so there is no id to query
+                print(f"Failed to get photos for {member}")
+                continue
+            photo_api_url = f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos"
+            response = requests.get(photo_api_url, timeout=15)
+            if not response:  # a Response is falsy for 4xx/5xx status codes
+                print(f"Failed to get photos for {member}")
+                continue
+            photos = response.json().get("photos", [])
+            for progress, photo in enumerate(photos, start=1):
+                print(f"Now downloading {photo['slug']} ({progress}/{len(photos)})")
+                url = photo["src"]
+                out_path = dest / Path(urlparse(url).path).name
+                try:  # best effort: one failed image must not abort the whole scrape
+                    resp = requests.get(url, stream=True, timeout=15)
+                    resp.raise_for_status()
+                    with open(out_path, "wb") as f:
+                        for chunk in resp.iter_content(8192):
+                            f.write(chunk)
+                except (requests.RequestException, OSError) as e:
+                    print(f"Failed to download {url}: {e}")
+
+
+def renumber_images_recursive(root_folder: str | Path) -> None:
+    """Rename the images in each directory under root_folder to 0, 1, 2, ... in case-insensitive name order."""
+    for current_dir, _, _ in os.walk(Path(root_folder)):
+        directory = Path(current_dir)
+        images = [
+            entry for entry in directory.iterdir()
+            if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS
+        ]
+        if not images:
+            continue
+        images.sort(key=lambda entry: entry.name.lower())
+
+        # Two passes: park every image under a temporary name first so a final
+        # numeric name can never collide with an image that has not moved yet.
+        staged = []
+        for index, image in enumerate(images):
+            staged.append(image.rename(directory / f"__tmp_{index}{image.suffix.lower()}"))
+        for index, parked in enumerate(staged):
+            parked.rename(directory / f"{index}{parked.suffix.lower()}")
+
+        print(f"Renumbered {len(images)} images in {directory}")
 
 def main():
+    """Fetch the group data, download the member photos, then renumber the images under OUTPUT_DIR."""
     group_data = fetch_female_idols()
     download_images(group_data)
+    renumber_images_recursive(OUTPUT_DIR)
 
 
 if __name__ == "__main__":
diff --git a/captcha_scraper/scrape_member.py b/captcha_scraper/scrape_member.py
deleted file mode 100644
index de50237..0000000
--- a/captcha_scraper/scrape_member.py
+++ /dev/null
@@ -1,86 +0,0 @@
-import os
-from pathlib import Path
-from urllib.parse import urlparse
-import requests
-
-
-IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}
-
-def fetch_photos(url: str) -> list:
-    response = requests.get(url, timeout=30)
-    response.raise_for_status()
-    data = response.json()
-    return data.get("photos", [])
-
-
-def download_photos_from_url(url: str, folder_name: str) -> int:
-    target_dir = Path(__file__).resolve().parents[1] / "captcha-original" / folder_name
-    target_dir.mkdir(parents=True, exist_ok=True)
-    downloaded = 0
-    for photo in fetch_photos(url):
-        src = photo.get("src")
-        if not src:
-            continue
-
-        filename = Path(urlparse(src).path).name
-        if not filename:
-            continue
-
-        destination = target_dir / filename
-        if destination.exists():
-            continue
-
-        try:
-            image_response = requests.get(src, timeout=30)
-            image_response.raise_for_status()
-            destination.write_bytes(image_response.content)
-            print(f"Saved {destination}")
-            downloaded += 1
-        except Exception as e:
-            print(f"Failed to download {src}: {e}")
-
-    print(f"Downloaded {downloaded} files into {target_dir}")
-    return downloaded
-
-
-def renumber_images_recursive(root_folder: str | Path) -> None:
-    root = Path(root_folder)
-    for current_dir, _, _ in os.walk(root):
-        directory = Path(current_dir)
-        images = sorted(
-            [p for p in directory.iterdir() if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS],
-            key=lambda p: p.name.lower(),
-        )
-        if not images:
-            continue
-
-        temp_files = []
-        for i, image in enumerate(images):
-            temp = directory / f"__tmp_{i}{image.suffix.lower()}"
-            image.rename(temp)
-            temp_files.append(temp)
-
-        for i, temp in enumerate(temp_files):
-            temp.rename(directory / f"{i}{temp.suffix.lower()}")
-
-        print(f"Renumbered {len(images)} images in {directory}")
-
-
-def main():
-    pass
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=a1664634-5caf-45d3-a57f-49d99d929aa9&section=photos", "NMIXX")
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=4b6d6573-576e-45d6-8c50-73e885d36a3e&section=photos", "NMIXX")
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=ed6dc832-9b12-4be2-88f2-df7167b241d6&section=photos", "NMIXX")
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=cef6a133-21e5-4034-952f-1edc284a8620&section=photos", "NMIXX")
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=425a818e-b8c3-4f80-9f6c-9151dd490bab&section=photos", "NMIXX")
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=a9d03774-55a1-4ae3-ba4d-3852f674c2e0&section=photos", "NMIXX")
-    # renumber_images_recursive("../captcha-original/NMIXX")
-    # aespa
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=689e39d5-49f1-4454-9f22-82c2911c855d&section=photos", "aespa")
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=324e43db-9254-4e7e-adf5-48f5318850e2&section=photos", "aespa")
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=5307515b-2488-4043-bf00-06887c5b62aa&section=photos", "aespa")
-    # download_photos_from_url("https://kpopping.com/api/idol-sections?idolId=077c4f02-7ca6-49a6-9daf-df1dabc55d0f&section=photos", "aespa")
-    # renumber_images_recursive("../captcha-original/aespa")
-
-if __name__ == "__main__":
-    main()
author	Pinapelz <yukais@pinapelz.com>	2026-05-27 16:20:27 -0700
committer	Pinapelz <yukais@pinapelz.com>	2026-05-27 16:20:27 -0700
commit	f633e8b0d7c42079efb7f3885b96ab20f674302a (patch)
tree	c9b4f24b91989f9eaafd63947cdaef9a546d7b33
parent	478b257fa4b5f09730f87e6bf35555a1062e26ac (diff)