about | summary | refs | log | tree | commit | diff | stats
path: root/captcha_scraper/scrape_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'captcha_scraper/scrape_data.py')
-rw-r--r-- captcha_scraper/scrape_data.py 79
1 files changed, 79 insertions, 0 deletions
diff --git a/captcha_scraper/scrape_data.py b/captcha_scraper/scrape_data.py
new file mode 100644
index 0000000..dcfaa86
--- /dev/null
+++ b/captcha_scraper/scrape_data.py
@@ -0,0 +1,79 @@
+import os
+from pathlib import Path
+from urllib.parse import urlparse
+
+import requests
+
+from groups import PULL_GROUPS
+
+KPOPPING_BASE = "https://kpopping.com"  # site root; API paths are appended to this
+IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}  # lowercase image suffixes, each with its leading dot
+
+
+def fetch_female_idols() -> dict:
+ has_more = True
+ idx = 1
+ result = {}
+ while has_more:
+ response = requests.get(f"{KPOPPING_BASE}/api/index-entries?type=idols&range={idx}&gender=female")
+ api_response = dict(response.json())
+ member_data = api_response["data"]
+ for curr_letter_list in member_data:
+ for member in curr_letter_list["entries"]:
+ if "group" not in member:
+ continue
+ group = member["group"]
+ if group not in result:
+ result[group] = [member]
+ else:
+ result[group].append(member)
+ has_more = bool(api_response["hasMore"])
+ idx += 1
+ return result
+
+
+def download_images(group_data):
+ base_dir = Path(__file__).resolve().parent
+ total = 0
+ for group_name in PULL_GROUPS:
+ members = group_data.get(group_name)
+ if not members:
+ print(f"Group '{group_name}' not found in fetched data, skipping.")
+ continue
+ group_dir = base_dir / group_name
+ group_dir.mkdir(parents=True, exist_ok=True)
+ existing_indices = []
+ for p in group_dir.iterdir():
+ if p.is_file() and p.stem.isdigit():
+ existing_indices.append(int(p.stem))
+ next_index = max(existing_indices) + 1 if existing_indices else 0
+
+ for member in members:
+ image_url = member.get("image")
+ if not image_url:
+ continue
+ parsed = urlparse(image_url)
+ ext = os.path.splitext(parsed.path)[1].lower() or ".webp"
+ dest = group_dir / f"{next_index}{ext}"
+ while dest.exists():
+ next_index += 1
+ dest = group_dir / f"{next_index}{ext}"
+ try:
+ resp = requests.get(image_url, timeout=20)
+ resp.raise_for_status()
+ with open(dest, "wb") as f:
+ f.write(resp.content)
+ print(f"Saved {dest}")
+ total += 1
+ except Exception as e:
+ print(f"Failed to download {image_url}: {e}")
+ next_index += 1
+ print(f"Downloaded {total} images.")
+
+def main():
+ group_data = fetch_female_idols()
+ download_images(group_data)
+
+
+if __name__ == "__main__":
+ main()
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage