aboutsummaryrefslogtreecommitdiffstats
path: root/captcha_scraper/scrape_data.py
diff options
context:
space:
mode:
author: Pinapelz <yukais@pinapelz.com> 2026-05-27 16:20:27 -0700
committer: Pinapelz <yukais@pinapelz.com> 2026-05-27 16:20:27 -0700
commit: f633e8b0d7c42079efb7f3885b96ab20f674302a (patch)
tree: c9b4f24b91989f9eaafd63947cdaef9a546d7b33 /captcha_scraper/scrape_data.py
parent: 478b257fa4b5f09730f87e6bf35555a1062e26ac (diff)
improve kpop captcha scraper to pull from member pages
Diffstat (limited to 'captcha_scraper/scrape_data.py')
-rw-r--r-- captcha_scraper/scrape_data.py 109
1 files changed, 72 insertions, 37 deletions
diff --git a/captcha_scraper/scrape_data.py b/captcha_scraper/scrape_data.py
index dcfaa86..1e73d55 100644
--- a/captcha_scraper/scrape_data.py
+++ b/captcha_scraper/scrape_data.py
@@ -1,14 +1,15 @@
-import os
from pathlib import Path
+import os
+from bs4 import BeautifulSoup
from urllib.parse import urlparse
-
+import re
import requests
-from groups import PULL_GROUPS
+from groups import PULL_GROUPS, EXCLUSIONS
KPOPPING_BASE = "https://kpopping.com"
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}
-
+OUTPUT_DIR = Path("../captcha-original")
def fetch_female_idols() -> dict:
has_more = True
@@ -31,48 +32,82 @@ def fetch_female_idols() -> dict:
idx += 1
return result
# Matches profile-page links of the form "/kpics?idol=<id>&idolName=<name>";
# group 1 captures the value of the "idol" query parameter.
_KPICS_HREF_RE = re.compile(r"^/kpics\?idol=([^&]+)&idolName=.+$")


def extract_idol_id(idol_name_id: str) -> str | None:
    """Look up the idol id used by the kpics link on an idol's profile page.

    Fetches ``{KPOPPING_BASE}/profiles/idol/{idol_name_id}`` and scans every
    ``<a>`` tag for an href of the form ``/kpics?idol=<id>&idolName=...``.

    Args:
        idol_name_id: Path segment identifying the idol's profile page.

    Returns:
        The captured ``idol`` query value from the first matching link, or
        ``None`` when the page could not be fetched successfully or contains
        no such link.

    Raises:
        requests.RequestException: If the request itself fails (connection
            error, timeout).
    """
    response = requests.get(
        f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}", timeout=15
    )
    if not response.ok:
        # An error page will not contain the link we are looking for.
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    for tag in soup.find_all("a"):
        href = tag.get("href")
        # Anchors without an href yield None, which the regex engine would
        # reject with a TypeError.
        if not href:
            continue
        match = _KPICS_HREF_RE.search(href)
        if match is not None:
            return match.group(1)
    return None
def _fetch_member_photos(member):
    """Return the photo entries the photos API lists for *member* ([] on failure)."""
    try:
        idol_id = extract_idol_id(member["id"])
    except requests.RequestException as exc:
        print(f"Failed to get photos for {member}: {exc}")
        return []
    if idol_id is None:
        # Without an id the API URL would be built with the literal "None".
        print(f"Could not resolve idol id for {member['name']}, skipping.")
        return []

    photo_api_url = f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos"
    try:
        response = requests.get(photo_api_url, timeout=15)
        response.raise_for_status()
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a body that is not valid JSON.
        print(f"Failed to get photos for {member}: {exc}")
        return []
    if not isinstance(payload, dict):
        print(f"Failed to get photos for {member}")
        return []
    return payload.get("photos") or []


def _save_photo(url, out_path):
    """Stream *url* into *out_path*; return True on success, False on failure."""
    # Write to a side file first so an interrupted transfer never leaves a
    # truncated image under the final name (and never clobbers an existing one).
    partial = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=15) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in resp.iter_content(8192):
                    if chunk:
                        f.write(chunk)
        partial.replace(out_path)
    except (requests.RequestException, OSError) as exc:
        print(f"Failed to download {url}: {exc}")
        partial.unlink(missing_ok=True)
        return False
    return True


def download_images(group_data):
    """Download the listed photos of every non-excluded member of each pull group.

    For each group name in ``PULL_GROUPS`` that is present in *group_data*,
    images are saved under ``OUTPUT_DIR / <group>`` using the file name taken
    from the photo URL's path. Members whose ``"id"`` appears in
    ``EXCLUSIONS[group]`` are skipped. A failure for one member or one image
    is reported and the run continues with the next one.

    Args:
        group_data: Mapping of group name to a list of member dicts; each
            member dict is read for its ``"id"`` and ``"name"`` keys.
    """
    for group in PULL_GROUPS:
        members = group_data.get(group)
        if not members:
            print(f"Group '{group}' not found in fetched data, skipping.")
            continue

        # Create the directory only once we know there is something to save.
        dest = OUTPUT_DIR / group
        dest.mkdir(parents=True, exist_ok=True)
        excluded = EXCLUSIONS.get(group, ())

        for member in members:
            if member["id"] in excluded:
                continue
            print(f"Downloading images for {member['name']}")

            photos = _fetch_member_photos(member)
            photos_found = len(photos)
            for progress, photo in enumerate(photos, start=1):
                print(f"Now downloading {photo['slug']} ({progress}/{photos_found})")
                url = photo["src"]
                filename = Path(urlparse(url).path).name
                if not filename:
                    # A URL whose path ends in "/" has no usable file name.
                    print(f"Failed to download {url}: no file name in URL")
                    continue
                _save_photo(url, dest / filename)
+
+
def renumber_images_recursive(root_folder: str | Path) -> None:
    """Rename the images in each directory under *root_folder* to 0, 1, 2, ...

    Every directory visited by ``os.walk`` is handled on its own: files whose
    lower-cased suffix is in ``IMAGE_EXTENSIONS`` are ordered by lower-cased
    name and renamed to ``<index><suffix>`` with the suffix lower-cased.
    Directories containing no such files are left untouched.

    Args:
        root_folder: Top of the directory tree to process.
    """
    for dirpath, _subdirs, _filenames in os.walk(Path(root_folder)):
        folder = Path(dirpath)

        pictures = []
        for entry in folder.iterdir():
            if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS:
                pictures.append(entry)
        pictures.sort(key=lambda entry: entry.name.lower())
        if not pictures:
            continue

        # Stage everything under placeholder names first so that a final
        # name such as "0.jpg" cannot collide with a file not yet moved.
        staged = [
            folder / f"__tmp_{index}{picture.suffix.lower()}"
            for index, picture in enumerate(pictures)
        ]
        for picture, holding in zip(pictures, staged):
            picture.rename(holding)

        for index, holding in enumerate(staged):
            final_name = f"{index}{holding.suffix.lower()}"
            holding.rename(folder / final_name)

        print(f"Renumbered {len(pictures)} images in {folder}")
def main():
    """Fetch the idol data, download member photos, then renumber the saved images."""
    group_data = fetch_female_idols()
    download_images(group_data)
    # Renumber the same tree download_images writes to, instead of repeating
    # its path as a separate string literal that could drift out of sync.
    renumber_images_recursive(OUTPUT_DIR)
+
if __name__ == "__main__":
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage