aboutsummaryrefslogtreecommitdiffstats
path: root/captcha_scraper/scrape_data.py
diff options
context:
space:
mode:
author: Pinapelz <yukais@pinapelz.com> 2026-05-28 11:48:42 -0700
committer: Pinapelz <yukais@pinapelz.com> 2026-05-28 11:50:13 -0700
commit: 403f2004c1ac19299390550bfda2fff7adcf5142 (patch)
tree: 3a9c0da9e2dc8b45095c22cda3877272b13e2ac4 /captcha_scraper/scrape_data.py
parent: 7f1228d0af006cf1b36571ea7c97e0d70457aa94 (diff)
convert captcha scraper to a submodule
Diffstat (limited to 'captcha_scraper/scrape_data.py')
-rw-r--r--captcha_scraper/scrape_data.py114
1 files changed, 0 insertions, 114 deletions
diff --git a/captcha_scraper/scrape_data.py b/captcha_scraper/scrape_data.py
deleted file mode 100644
index 3a31295..0000000
--- a/captcha_scraper/scrape_data.py
+++ /dev/null
@@ -1,114 +0,0 @@
-from pathlib import Path
-import os
-from bs4 import BeautifulSoup
-from urllib.parse import urlparse
-import re
-import requests
-
-from groups import PULL_GROUPS, EXCLUSIONS
-
# Root URL that every kpopping.com page and API request is built from.
KPOPPING_BASE = "https://kpopping.com"

# Lower-case file suffixes treated as images when renumbering downloads.
# A frozenset keeps this lookup table from being mutated by accident.
IMAGE_EXTENSIONS = frozenset(
    {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}
)

# Directory (relative to the working directory) that receives one
# sub-folder of downloaded images per group.
OUTPUT_DIR = Path("../captcha-original")
-
def fetch_female_idols() -> dict:
    """Fetch the female-idol index from kpopping and group entries by group.

    Requests ``/api/index-entries`` page by page (``range`` starts at 1 and
    increases by one) until the response's ``"hasMore"`` value is falsy.
    Each page's ``"data"`` is a list of buckets whose ``"entries"`` are the
    idol records; records without a ``"group"`` key are skipped.

    Returns:
        A dict mapping each record's ``"group"`` value to the list of raw
        record dicts carrying that value, in the order the API returned them.

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-success HTTP status.
        KeyError: If a page lacks the ``"data"``, ``"entries"`` or
            ``"hasMore"`` keys.
    """
    result = {}
    page = 1
    while True:
        # A timeout keeps a stalled connection from hanging the scrape forever.
        response = requests.get(
            f"{KPOPPING_BASE}/api/index-entries?type=idols&range={page}&gender=female",
            timeout=15,
        )
        # Fail loudly on an HTTP error instead of trying to parse an error page.
        response.raise_for_status()
        api_response = response.json()

        for bucket in api_response["data"]:
            for member in bucket["entries"]:
                if "group" not in member:
                    continue
                result.setdefault(member["group"], []).append(member)

        if not api_response["hasMore"]:
            return result
        page += 1
-
def extract_idol_id(idol_name_id: str):
    """Look up the ``idol`` query value linked from an idol's profile page.

    Downloads ``/profiles/idol/<idol_name_id>`` and scans its ``<a>`` tags
    for the first ``href`` of the form
    ``/kpics?idol=<value>&idolName=<anything>``.

    Args:
        idol_name_id: Path segment identifying the idol's profile page.

    Returns:
        The captured ``idol`` value as a string, or ``None`` when no link on
        the page matches.

    Raises:
        requests.RequestException: On a network failure or timeout.
    """
    response = requests.get(
        f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}", timeout=15
    )
    soup = BeautifulSoup(response.text, "html.parser")
    # Raw string: the previous non-raw pattern relied on the invalid escape
    # sequences "\/" and "\?", which newer Python versions warn about.
    kpics_link = re.compile(r"^/kpics\?idol=([^&]+)&idolName=.+$")
    # href=True skips anchors without an href, whose value would be None and
    # make the regex search raise TypeError.
    for tag in soup.find_all("a", href=True):
        match = kpics_link.search(tag["href"])
        if match is not None:
            return match.group(1)
    return None
-
def _save_photo(url, dest):
    """Stream the file at *url* into directory *dest*, named after the URL path."""
    out_path = dest / Path(urlparse(url).path).name
    # The context manager releases the streamed connection even on failure.
    with requests.get(url, stream=True, timeout=15) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(8192):
                if chunk:
                    f.write(chunk)


def download_images(group_data):
    """Download every listed photo for the members of each group in PULL_GROUPS.

    For each group, images are written to ``OUTPUT_DIR / group``. Members
    whose ``"id"`` appears in ``EXCLUSIONS[group]`` are skipped. For the
    rest, the idol id is resolved via ``extract_idol_id`` and the photo list
    is read from the ``/api/idol-sections`` endpoint; each photo's ``"src"``
    URL is saved under the file name taken from the URL path.

    Groups missing from *group_data*, members whose idol id cannot be
    resolved, and members whose photo request fails are reported with a
    message and skipped.

    Args:
        group_data: Mapping of group name to a list of member dicts; each
            member dict is read for its ``"id"`` and ``"name"`` keys.

    Raises:
        requests.RequestException: On a network failure or timeout, or when
            an individual image download returns an error status.
    """
    for group in PULL_GROUPS:
        # The group comes from PULL_GROUPS, so the meaningful check is whether
        # the fetched data actually contains it (otherwise: KeyError below).
        if group not in group_data:
            print(f"{group} not in fetched group data")
            continue

        dest = OUTPUT_DIR / group
        dest.mkdir(parents=True, exist_ok=True)
        exclude_list = EXCLUSIONS[group] if group in EXCLUSIONS else []

        for member in group_data[group]:
            member_id = member["id"]
            if member_id in exclude_list:
                continue
            print(f"Downloading images for {member['name']}")

            idol_id = extract_idol_id(member_id)
            if idol_id is None:
                # Without an id the photo API URL would query "idolId=None".
                print(f"Could not resolve idol id for {member_id}")
                continue

            photo_api_url = (
                f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos"
            )
            response = requests.get(photo_api_url, timeout=15)
            if not response:
                print(f"Failed to get photos for {member}")
                # Skip this member rather than parsing an error response.
                continue

            photos = response.json()["photos"]
            photos_found = len(photos)
            for progress, photo in enumerate(photos, start=1):
                print(f"Now downloading {photo['slug']} ({progress}/{photos_found})")
                _save_photo(photo["src"], dest)
-
-
def renumber_images_recursive(root_folder: str | Path) -> None:
    """Rename the images in every directory under *root_folder* to 1, 2, 3, ...

    Each directory visited by ``os.walk`` is handled on its own: its regular
    files whose lower-cased suffix is in ``IMAGE_EXTENSIONS`` are sorted by
    lower-cased file name and renamed to ``<position><suffix>`` (positions
    start at 1, suffix lower-cased). Other files are left untouched, and
    directories without images are skipped silently.

    Args:
        root_folder: Directory whose tree is processed.
    """
    import uuid  # local import: only needed for the temporary-name token

    # A per-run token keeps the temporary names from colliding with anything
    # already on disk, e.g. "__tmp_N" leftovers from an interrupted run that
    # a fixed naming scheme would silently overwrite.
    token = uuid.uuid4().hex

    for current_dir, _, _ in os.walk(Path(root_folder)):
        directory = Path(current_dir)
        images = sorted(
            (
                entry
                for entry in directory.iterdir()
                if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS
            ),
            key=lambda entry: entry.name.lower(),
        )
        if not images:
            continue

        # Pass 1: move everything to unique temporary names so that a final
        # name such as "2.jpg" cannot clobber a not-yet-renamed image.
        staged = []
        for number, image in enumerate(images, start=1):
            suffix = image.suffix.lower()
            temp = directory / f"__tmp_{token}_{number}{suffix}"
            image.rename(temp)
            staged.append((temp, directory / f"{number}{suffix}"))

        # Pass 2: move the staged files to their final numbered names.
        for temp, final in staged:
            temp.rename(final)

        print(f"Renumbered {len(images)} images in {directory}")
-
def main():
    """Run the scrape: fetch the idol index, download photos, renumber files."""
    group_data = fetch_female_idols()
    download_images(group_data)
    # Reuse OUTPUT_DIR (the directory download_images writes into) instead of
    # repeating the path literal, so the two steps cannot drift apart.
    renumber_images_recursive(OUTPUT_DIR)


if __name__ == "__main__":
    main()
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage