diff options
Diffstat (limited to 'captcha_scraper/scrape_data.py')
| -rw-r--r-- | captcha_scraper/scrape_data.py | 79 |
1 files changed, 79 insertions, 0 deletions
# Reconstructed from the diff that adds captcha_scraper/scrape_data.py
# (new file, mode 100644, index 0000000..dcfaa86).
"""Fetch the female-idol index from kpopping.com and download member images.

Images are stored in one directory per group (next to this script), named
with consecutive integer stems: ``<group>/<n><ext>``.
"""

import os
from pathlib import Path
from urllib.parse import urlparse

import requests

from groups import PULL_GROUPS

KPOPPING_BASE = "https://kpopping.com"
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}

# Seconds to wait on any single HTTP request. Without a timeout, requests
# blocks indefinitely on a stalled connection.
REQUEST_TIMEOUT = 20

# Extension used when the image URL path carries none.
DEFAULT_IMAGE_EXTENSION = ".webp"


def fetch_female_idols() -> dict:
    """Page through the index-entries API and group members by group.

    Pages are requested with ``range=1, 2, ...`` until the response's
    ``hasMore`` field is falsy. Members without a ``"group"`` key are skipped.

    Returns:
        A dict mapping each member's ``"group"`` value to the list of member
        dicts (as returned by the API) belonging to that group.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        KeyError: If a response lacks the ``"data"``, ``"entries"`` or
            ``"hasMore"`` fields this function reads.
    """
    result = {}
    idx = 1
    has_more = True
    while has_more:
        response = requests.get(
            f"{KPOPPING_BASE}/api/index-entries?type=idols&range={idx}&gender=female",
            timeout=REQUEST_TIMEOUT,
        )
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        api_response = response.json()
        for curr_letter_list in api_response["data"]:
            for member in curr_letter_list["entries"]:
                if "group" not in member:
                    continue
                result.setdefault(member["group"], []).append(member)
        has_more = bool(api_response["hasMore"])
        idx += 1
    return result


def _next_free_index(group_dir: Path) -> int:
    """Return one past the largest integer file stem in *group_dir* (0 if none)."""
    # isdecimal() rather than isdigit(): stems like "²" satisfy isdigit()
    # but make int() raise ValueError.
    indices = (
        int(p.stem)
        for p in group_dir.iterdir()
        if p.is_file() and p.stem.isdecimal()
    )
    return max(indices, default=-1) + 1


def _image_extension(image_url: str) -> str:
    """Return the lower-cased extension of the URL's path, or the default."""
    url_path = urlparse(image_url).path
    return os.path.splitext(url_path)[1].lower() or DEFAULT_IMAGE_EXTENSION


def download_images(group_data):
    """Download one image per member for every group in ``PULL_GROUPS``.

    Args:
        group_data: Mapping of group name to a list of member dicts, as
            produced by ``fetch_female_idols``. Each member's ``"image"``
            value, when present and non-empty, is used as the download URL.

    Files are written to ``<script dir>/<group name>/<index><ext>``. Indexing
    continues after the highest integer-named file already in the directory,
    so re-running never overwrites earlier downloads. A failed download is
    reported and skipped; its index is still consumed.
    """
    base_dir = Path(__file__).resolve().parent
    total = 0
    for group_name in PULL_GROUPS:
        members = group_data.get(group_name)
        if not members:
            print(f"Group '{group_name}' not found in fetched data, skipping.")
            continue
        group_dir = base_dir / group_name
        group_dir.mkdir(parents=True, exist_ok=True)
        next_index = _next_free_index(group_dir)

        for member in members:
            image_url = member.get("image")
            if not image_url:
                continue
            ext = _image_extension(image_url)
            dest = group_dir / f"{next_index}{ext}"
            # Defensive: never clobber a file that appeared since the scan.
            while dest.exists():
                next_index += 1
                dest = group_dir / f"{next_index}{ext}"
            try:
                resp = requests.get(image_url, timeout=REQUEST_TIMEOUT)
                resp.raise_for_status()
                with open(dest, "wb") as f:
                    f.write(resp.content)
            except (requests.RequestException, OSError) as e:
                # Best effort: one bad image must not abort the whole run.
                print(f"Failed to download {image_url}: {e}")
            else:
                print(f"Saved {dest}")
                total += 1
            next_index += 1
    print(f"Downloaded {total} images.")


def main():
    """Fetch the idol index, then download images for the configured groups."""
    group_data = fetch_female_idols()
    download_images(group_data)


if __name__ == "__main__":
    main()
