diff options
Diffstat (limited to 'scrape_data.py')
| -rw-r--r-- | scrape_data.py | 112 |
1 files changed, 112 insertions, 0 deletions
# Reconstructed from the collapsed diff view of scrape_data.py
# (diff --git a/scrape_data.py b/scrape_data.py, new file, index 0000000..2fd32a9).
"""Download idol photos from kpopping.com and renumber them per group folder.

The script queries the site's index API for female idols, groups them by their
``group`` field, downloads every photo listed for the members of the groups in
``PULL_GROUPS`` (minus the ids listed in ``EXCLUSIONS``) into
``OUTPUT_DIR/<group>``, and finally renames the images in every folder to
``1.<ext>``, ``2.<ext>``, ...
"""
import os
import re
import uuid
from pathlib import Path
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup

from groups import EXCLUSIONS, PULL_GROUPS

KPOPPING_BASE = "https://kpopping.com"
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}
OUTPUT_DIR = Path("../captcha-original")

# Seconds to wait on any single HTTP request before giving up.
REQUEST_TIMEOUT = 15

# Matches the "/kpics?idol=<id>&idolName=<name>" link on an idol profile page;
# group 1 is the idol id.
_IDOL_ID_HREF = re.compile(r"^/kpics\?idol=([^&]+)&idolName=.+$")


def fetch_female_idols() -> dict:
    """Return the female idols from the index API, keyed by group.

    Pages through ``/api/index-entries`` (``range`` starting at 1) until the
    API reports ``hasMore`` as falsy. Entries without a ``group`` key are
    skipped.

    Returns:
        A dict mapping each ``group`` value to the list of member entries
        (as returned by the API) that carry that group.

    Raises:
        requests.HTTPError: if a page request returns an error status.
    """
    result = {}
    idx = 1
    while True:
        response = requests.get(
            f"{KPOPPING_BASE}/api/index-entries?type=idols&range={idx}&gender=female",
            timeout=REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        api_response = dict(response.json())
        for curr_letter_list in api_response["data"]:
            for member in curr_letter_list["entries"]:
                if "group" not in member:
                    continue
                result.setdefault(member["group"], []).append(member)
        if not api_response["hasMore"]:
            return result
        idx += 1


def extract_idol_id(idol_name_id: str) -> str | None:
    """Look up the idol id used by the photo API from an idol's profile page.

    Fetches ``/profiles/idol/<idol_name_id>`` and returns the ``idol`` query
    value of the first link of the form ``/kpics?idol=...&idolName=...``.

    Args:
        idol_name_id: The profile slug/id of the idol.

    Returns:
        The idol id, or ``None`` when no matching link is found.
    """
    response = requests.get(
        f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}",
        timeout=REQUEST_TIMEOUT,
    )
    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors without an href, which would otherwise hand
    # None to the regex and raise TypeError.
    for tag in soup.find_all("a", href=True):
        match = _IDOL_ID_HREF.search(tag["href"])
        if match is not None:
            return match.group(1)
    return None


def _download_file(url: str, out_path: Path) -> None:
    """Stream *url* to *out_path*, raising ``requests.HTTPError`` on a bad status."""
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(8192):
                if chunk:
                    f.write(chunk)


def download_images(group_data):
    """Download all photos for the members of every group in ``PULL_GROUPS``.

    Photos are written to ``OUTPUT_DIR/<group>/<basename of the photo URL>``.
    Groups missing from *group_data*, members listed in ``EXCLUSIONS`` for
    their group, members whose idol id cannot be resolved, and members whose
    photo listing request fails are reported (where applicable) and skipped.

    Args:
        group_data: Mapping of group to member entries, as produced by
            ``fetch_female_idols``.

    Raises:
        requests.HTTPError: if an individual image download returns an error
            status.
    """
    for group in PULL_GROUPS:
        # The group may be configured but absent from the scraped data.
        if group not in group_data:
            print(f"{group} not found in scraped idol data")
            continue
        dest = OUTPUT_DIR / group
        dest.mkdir(parents=True, exist_ok=True)
        exclude_list = EXCLUSIONS[group] if group in EXCLUSIONS else []
        for member in group_data[group]:
            member_id = member["id"]
            if member_id in exclude_list:
                continue
            print(f"Downloading images for {member['name']}")
            idol_id = extract_idol_id(member_id)
            if idol_id is None:
                print(f"Could not find idol id for {member['name']}")
                continue

            photo_api_url = f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos"
            response = requests.get(photo_api_url, timeout=REQUEST_TIMEOUT)
            if not response:
                print(f"Failed to get photos for {member}")
                continue
            photos = dict(response.json())["photos"]
            photos_found = len(photos)
            for progress, photo in enumerate(photos, start=1):
                print(f"Now downloading {photo['slug']} ({progress}/{photos_found})")
                url = photo["src"]
                filename = Path(urlparse(url).path).name
                _download_file(url, dest / filename)


def renumber_images_recursive(root_folder: str | Path) -> None:
    """Rename the images in every directory under *root_folder* to 1..N.

    In each directory, files whose lower-cased suffix is in
    ``IMAGE_EXTENSIONS`` are sorted by lower-cased name and renamed to
    ``<index><suffix>`` (suffix lower-cased, index starting at 1). Renaming
    goes through temporary names so that an image already called e.g.
    ``2.jpg`` is not overwritten while another file is being moved to it.

    Args:
        root_folder: Directory tree to process.
    """
    root = Path(root_folder)
    # A per-run token keeps temp names from colliding with leftovers of an
    # earlier, interrupted run (a fixed "__tmp_N" name could overwrite them).
    token = uuid.uuid4().hex
    for current_dir, _, _ in os.walk(root):
        directory = Path(current_dir)
        images = sorted(
            (
                p
                for p in directory.iterdir()
                if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS
            ),
            key=lambda p: p.name.lower(),
        )
        if not images:
            continue

        temp_files = []
        for i, image in enumerate(images, start=1):
            temp = directory / f"__tmp_{token}_{i}{image.suffix.lower()}"
            image.rename(temp)
            temp_files.append(temp)

        for i, temp in enumerate(temp_files, start=1):
            temp.rename(directory / f"{i}{temp.suffix.lower()}")

        print(f"Renumbered {len(images)} images in {directory}")


def main():
    """Fetch the idol index, download the configured groups, then renumber."""
    group_data = fetch_female_idols()
    download_images(group_data)
    renumber_images_recursive(OUTPUT_DIR)


if __name__ == "__main__":
    main()
