"""Download kpopping.com photos for the configured idol groups and renumber them.

The script fetches the female-idol index, downloads the photos of each
selected member into one directory per group, and finally renames the images
in every directory to consecutive numbers.
"""
from pathlib import Path
import os
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import re
import requests
from groups import PULL_GROUPS, EXCLUSIONS
# Base URL of the kpopping site; every API and profile URL is built from it.
KPOPPING_BASE = "https://kpopping.com"
# Lower-case file suffixes that count as images when files are renumbered.
IMAGE_EXTENSIONS = {".jpg", ".jpeg", ".png", ".webp", ".gif", ".bmp", ".tiff", ".tif", ".avif"}
# Root directory that receives one sub-directory of downloaded photos per group.
OUTPUT_DIR = Path("../captcha-original")
def fetch_female_idols() -> dict:
    """Fetch the female-idol index from the kpopping API, grouped by group.

    Requests ``/api/index-entries`` page by page (the ``range`` query
    parameter starts at 1 and is incremented by one per request) until the
    response's ``hasMore`` field is falsy. Entries that carry no ``"group"``
    key are skipped.

    Returns:
        A dict mapping each entry's ``"group"`` value to the list of index
        entries (exactly as returned by the API) that belong to that group.

    Raises:
        requests.HTTPError: If a page is answered with an HTTP error status.
        requests.RequestException: On connection failure or timeout.
    """
    result: dict = {}
    page = 1
    has_more = True
    while has_more:
        # A timeout keeps a stalled connection from hanging the whole run,
        # matching how the image downloads in this file are requested.
        response = requests.get(
            f"{KPOPPING_BASE}/api/index-entries?type=idols&range={page}&gender=female",
            timeout=15,
        )
        # Fail loudly on an error page instead of with an opaque JSON/KeyError.
        response.raise_for_status()
        payload = response.json()

        for letter_section in payload["data"]:
            for member in letter_section["entries"]:
                if "group" not in member:
                    continue
                result.setdefault(member["group"], []).append(member)

        has_more = bool(payload["hasMore"])
        page += 1
    return result
def extract_idol_id(idol_name_id: str) -> str | None:
    """Find the idol id used by the kpics gallery on an idol's profile page.

    Fetches ``/profiles/idol/<idol_name_id>`` and scans the page's anchors for
    the first ``href`` of the form ``/kpics?idol=<id>&idolName=...``.

    Args:
        idol_name_id: Identifier placed in the profile URL path.

    Returns:
        The captured ``<id>`` of the first matching link, or ``None`` when the
        page contains no such link.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    response = requests.get(f"{KPOPPING_BASE}/profiles/idol/{idol_name_id}", timeout=15)
    soup = BeautifulSoup(response.text, "html.parser")

    # Raw string: "\?" and "\/" are invalid escapes in a normal string literal.
    kpics_link = re.compile(r"^/kpics\?idol=([^&]+)&idolName=.+$")

    # href=True skips anchors without an href attribute; passing their None
    # value to the regex would raise TypeError.
    for tag in soup.find_all("a", href=True):
        match = kpics_link.search(tag["href"])
        if match is not None:
            return match.group(1)
    return None
def _download_file(url, out_path):
    """Stream the resource at *url* into *out_path*, replacing any existing file."""
    # The context manager releases the streamed connection even on errors.
    with requests.get(url, stream=True, timeout=15) as resp:
        resp.raise_for_status()
        with open(out_path, "wb") as f:
            for chunk in resp.iter_content(8192):
                if chunk:
                    f.write(chunk)


def download_images(group_data):
    """Download the photos of every non-excluded member of each group in PULL_GROUPS.

    For each group, a directory ``OUTPUT_DIR / group`` is created and every
    photo listed by the ``idol-sections`` API for each member is saved into
    it. Files are named after the last path component of the photo's ``src``
    URL, so a later photo with the same file name overwrites an earlier one.

    Groups missing from *group_data*, members whose idol id cannot be found
    and members whose photo listing request fails are reported with ``print``
    and skipped.

    Args:
        group_data: Mapping of group key to a list of member entries; each
            entry is read for its ``"id"`` and ``"name"`` keys.

    Raises:
        requests.HTTPError: If an individual photo download returns an error
            status.
        requests.RequestException: On connection failure or timeout.
    """
    for group in PULL_GROUPS:
        # Check the fetched data (not PULL_GROUPS itself, which is always
        # true here) so an unknown group is skipped instead of raising KeyError.
        if group not in group_data:
            print(f"{group} not found in fetched idol data")
            continue

        dest = OUTPUT_DIR / group
        dest.mkdir(parents=True, exist_ok=True)

        exclude_list = EXCLUSIONS[group] if group in EXCLUSIONS else []

        for member in group_data[group]:
            member_id = member["id"]
            if member_id in exclude_list:
                continue

            print(f"Downloading images for {member['name']}")
            idol_id = extract_idol_id(member_id)
            if idol_id is None:
                print(f"Could not find idol id for {member['name']}")
                continue

            photo_api_url = f"{KPOPPING_BASE}/api/idol-sections?idolId={idol_id}&section=photos"
            response = requests.get(photo_api_url, timeout=15)
            # A Response is falsy for 4xx/5xx statuses; skip this member
            # rather than trying to parse an error body.
            if not response:
                print(f"Failed to get photos for {member}")
                continue

            photos = response.json()["photos"]
            photos_found = len(photos)
            for progress, photo in enumerate(photos, start=1):
                print(f"Now downloading {photo['slug']} ({progress}/{photos_found})")
                url = photo["src"]
                filename = Path(urlparse(url).path).name
                _download_file(url, dest / filename)
def _free_temp_path(directory: Path, index: int, suffix: str) -> Path:
    """Return a ``__tmp_<index>...<suffix>`` path in *directory* that does not exist yet."""
    candidate = directory / f"__tmp_{index}{suffix}"
    attempt = 0
    while candidate.exists():
        attempt += 1
        candidate = directory / f"__tmp_{index}_{attempt}{suffix}"
    return candidate


def renumber_images_recursive(root_folder: str | Path) -> None:
    """Rename the images in each directory under *root_folder* to 0, 1, 2, ...

    Every directory reached by ``os.walk`` is handled independently: its
    files whose lower-cased suffix is in ``IMAGE_EXTENSIONS`` are sorted by
    lower-cased file name and renamed to ``<index><lower-cased suffix>``.
    Other files are left untouched. A line is printed for every directory
    in which images were renamed.

    Args:
        root_folder: Directory tree to process.
    """
    root = Path(root_folder)
    for current_dir, _, _ in os.walk(root):
        directory = Path(current_dir)
        images = sorted(
            (
                p
                for p in directory.iterdir()
                if p.is_file() and p.suffix.lower() in IMAGE_EXTENSIONS
            ),
            key=lambda p: p.name.lower(),
        )
        if not images:
            continue

        # Phase 1: move everything to temporary names so a final name can
        # never clash with an image that has not been renamed yet. The temp
        # name is checked for existence because Path.rename silently replaces
        # an existing target on POSIX (e.g. a leftover "__tmp_0.jpg").
        temp_files = []
        for i, image in enumerate(images):
            temp = _free_temp_path(directory, i, image.suffix.lower())
            image.rename(temp)
            temp_files.append(temp)

        # Phase 2: assign the final consecutive names.
        for i, temp in enumerate(temp_files):
            temp.rename(directory / f"{i}{temp.suffix.lower()}")

        print(f"Renumbered {len(images)} images in {directory}")
def main():
    """Fetch the idol index, download the selected groups' photos, then renumber them."""
    group_data = fetch_female_idols()
    download_images(group_data)
    # Renumber the tree the downloads were written to; using the constant
    # keeps this in sync with the download destination.
    renumber_images_recursive(OUTPUT_DIR)


if __name__ == "__main__":
    main()