about summary refs log tree commit diff stats
path: root/scrape_data.py
diff options
context:
space:
mode:
Diffstat (limited to 'scrape_data.py')
-rw-r--r-- scrape_data.py 112
1 files changed, 112 insertions, 0 deletions
diff --git a/scrape_data.py b/scrape_data.py
new file mode 100644
index 0000000..2fd32a9
--- /dev/null
+++ b/scrape_data.py
@@ -0,0 +1,112 @@
+from pathlib import Path
+import os
+from bs4 import BeautifulSoup
+from urllib.parse import urlparse
+import re
+import requests
+
+from groups import PULL_GROUPS, EXCLUSIONS
+
# Root URL that every profile page and API request in this script is built on.
KPOPPING_BASE = "https://kpopping.com"

# File suffixes (lower-case, with the leading dot) that count as images when
# the downloaded files are renumbered.
IMAGE_EXTENSIONS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".webp",
    ".gif",
    ".bmp",
    ".tiff",
    ".tif",
    ".avif",
}

# Directory that receives one sub-folder of downloaded photos per group.
# It is a relative path, so it resolves against the current working directory.
OUTPUT_DIR = Path("../captcha-original")
+
def fetch_female_idols() -> dict:
    """Fetch the female idol index from the kpopping API, bucketed by group.

    Requests ``/api/index-entries`` with ``type=idols`` and ``gender=female``,
    starting at ``range=1`` and incrementing it until a response reports a
    falsy ``hasMore``. Every entry that has a ``"group"`` key is appended to
    the bucket for that group; entries without one are skipped.

    Returns:
        A dict mapping each ``"group"`` value to the list of entry dicts (as
        returned by the API) that carry it.

    Raises:
        requests.HTTPError: if a page request returns an error status.
        requests.RequestException: on connection failure or timeout.
        KeyError: if a response lacks ``"data"``, ``"entries"`` or ``"hasMore"``.
    """
    result: dict = {}
    idx = 1
    has_more = True
    while has_more:
        # A timeout and status check keep a stalled or failing server from
        # hanging the loop or feeding an error page into the JSON handling.
        response = requests.get(
            f"{KPOPPING_BASE}/api/index-entries?type=idols&range={idx}&gender=female",
            timeout=15,
        )
        response.raise_for_status()
        api_response = response.json()
        # "data" is a list of sections, each holding its own "entries" list.
        for curr_letter_list in api_response["data"]:
            for member in curr_letter_list["entries"]:
                if "group" not in member:
                    continue
                result.setdefault(member["group"], []).append(member)
        has_more = bool(api_response["hasMore"])
        idx += 1
    return result
+
def extract_idol_id(idol_name_id: str) -> str | None:
    """Find the idol id embedded in the kpics link on an idol's profile page.

    Downloads ``/profiles/idol/<idol_name_id>`` and scans its anchor tags for
    an ``href`` of the form ``/kpics?idol=<id>&idolName=...``.

    Args:
        idol_name_id: Path segment identifying the idol's profile page.

    Returns:
        The ``idol`` query value of the first matching link, or ``None`` when
        no anchor on the page matches.

    Raises:
        requests.RequestException: on connection failure or timeout.
    """
    response = requests.get(
        f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}", timeout=15
    )
    soup = BeautifulSoup(response.text, "html.parser")
    # Raw string: "\?" must reach the regex engine as an escaped question mark
    # rather than being treated as an (invalid) Python string escape.
    kpics_link = re.compile(r"^/kpics\?idol=([^&]+)&idolName=.+$")
    # href=True skips anchors without an href attribute, which would otherwise
    # hand None to the regex and raise TypeError.
    for tag in soup.find_all("a", href=True):
        match = kpics_link.search(tag["href"])
        if match is not None:
            return match.group(1)
    return None
+
def download_images(group_data):
    """Download the photos of every non-excluded member of each pulled group.

    For each group in ``PULL_GROUPS`` that is present in ``group_data``, a
    folder ``OUTPUT_DIR/<group>`` is created. Each member whose ``"id"`` is
    not listed in ``EXCLUSIONS[group]`` is resolved to an idol id through
    ``extract_idol_id``, the ``photos`` section of the idol-sections API is
    queried, and every photo's ``"src"`` is streamed to disk under the file
    name taken from the URL path.

    Groups missing from ``group_data``, members whose idol id cannot be
    resolved, and members whose photo listing request fails are reported
    with ``print`` and skipped.

    Args:
        group_data: Mapping of group to a list of member dicts; each member
            is read for its ``"id"`` and ``"name"`` keys.

    Raises:
        requests.HTTPError: if an individual image request returns an error
            status.
        requests.RequestException: on connection failure or timeout.
    """
    for group in PULL_GROUPS:
        # Check the fetched data, not PULL_GROUPS itself: the group being
        # iterated is always in PULL_GROUPS, so that test could never fire.
        if group not in group_data:
            print(f"{group} not found in fetched group data")
            continue
        dest = OUTPUT_DIR / group
        dest.mkdir(parents=True, exist_ok=True)
        exclude_list = EXCLUSIONS[group] if group in EXCLUSIONS else []
        for member in group_data[group]:
            member_id = member["id"]
            if member_id in exclude_list:
                continue
            print(f"Downloading images for {member['name']}")
            idol_id = extract_idol_id(member_id)
            if idol_id is None:
                # Without this guard the URL would be built with "idolId=None".
                print(f"Could not resolve idol id for {member_id}")
                continue
            photo_api_url = f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos"
            response = requests.get(photo_api_url, timeout=15)
            if not response:
                # A Response is falsy for 4xx/5xx statuses; skip this member
                # instead of trying to parse the error body as JSON.
                print(f"Failed to get photos for {member}")
                continue
            photos = response.json()["photos"]
            photos_found = len(photos)
            for progress, photo in enumerate(photos, start=1):
                print(f"Now downloading {photo['slug']} ({progress}/{photos_found})")
                url = photo["src"]
                filename = Path(urlparse(url).path).name
                if not filename:
                    # A URL path with no final component would make the
                    # output path point at the destination folder itself.
                    print(f"Skipping {url}: no file name in URL")
                    continue
                out_path = dest / filename
                # The context manager releases the streamed connection even
                # when the status check or a write fails.
                with requests.get(url, stream=True, timeout=15) as resp:
                    resp.raise_for_status()
                    with open(out_path, "wb") as f:
                        for chunk in resp.iter_content(8192):
                            if chunk:
                                f.write(chunk)
+
+
def renumber_images_recursive(root_folder: str | Path) -> None:
    """Rename the images in every directory under a root to 1.ext, 2.ext, ...

    Walks ``root_folder`` with ``os.walk``. In each directory, the regular
    files whose lower-cased suffix is in ``IMAGE_EXTENSIONS`` are sorted by
    lower-cased file name (plain string order, so ``"10.jpg"`` sorts before
    ``"2.jpg"``) and renamed to their 1-based position plus their lower-cased
    suffix. Directories without images are left untouched.

    Args:
        root_folder: Directory whose tree should be renumbered.

    Raises:
        OSError: if a rename fails.
    """
    # Local import keeps this block self-contained; uuid is only needed here.
    import uuid

    root = Path(root_folder)
    for current_dir, _, _ in os.walk(root):
        directory = Path(current_dir)
        images = sorted(
            (
                entry
                for entry in directory.iterdir()
                if entry.is_file() and entry.suffix.lower() in IMAGE_EXTENSIONS
            ),
            key=lambda entry: entry.name.lower(),
        )
        if not images:
            continue

        # Pass 1: move everything to temporary names so that pass 2 cannot
        # rename onto an image that still holds its old "N.ext" name. The
        # random token keeps these names from colliding with "__tmp_*" files
        # left behind by an interrupted earlier run.
        token = uuid.uuid4().hex
        temp_files = []
        for position, image in enumerate(images, start=1):
            temp = directory / f"__tmp_{token}_{position}{image.suffix.lower()}"
            image.rename(temp)
            temp_files.append(temp)

        # Pass 2: assign the final sequential names.
        for position, temp in enumerate(temp_files, start=1):
            temp.rename(directory / f"{position}{temp.suffix.lower()}")

        print(f"Renumbered {len(images)} images in {directory}")
+
def main():
    """Run the scrape: fetch the idol index, download photos, renumber them."""
    group_data = fetch_female_idols()
    download_images(group_data)
    # Use the same constant download_images writes to, so the renumbering
    # pass cannot drift from the download location.
    renumber_images_recursive(OUTPUT_DIR)


if __name__ == "__main__":
    main()
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage