aboutsummaryrefslogtreecommitdiffstats
path: root/bemani/sdvx.py
blob: 5a7d25cdd1ef9bf256f1c31d3108f3ff178d0991 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
from datetime import datetime
from urllib.parse import urljoin

from bs4 import BeautifulSoup, Comment, NavigableString, Tag

def parse_exceed_gear_news_site(html: str):
    """Parse a SOUND VOLTEX news page laid out as ``.tab ul.news li``.

    Every matched ``li`` that contains both a ``strong`` (the date) and a
    ``pre`` (the body) produces one entry; items missing either are skipped.

    Args:
        html: Raw HTML of the news page.

    Returns:
        A list of dicts with the keys ``date``, ``identifier``, ``type``,
        ``timestamp``, ``headline``, ``content``, ``url``, ``images`` and
        ``is_ai_summary``.

        * ``date`` is the stripped text of the ``strong`` element.
        * ``timestamp`` is an ``int`` epoch value, or ``None`` when ``date``
          is not in ``YYYY.MM.DD`` form.
        * ``headline`` is the text of ``p.notice``, or ``None`` if absent.
        * ``content`` is the text of the ``pre`` element, one line per
          text run.
        * ``images`` is a de-duplicated list of
          ``{'image': <absolute url>, 'link': <absolute url or None>}``.
    """
    base_url = "https://p.eagate.573.jp"
    soup = BeautifulSoup(html, 'html.parser')

    entries = []
    for li in soup.select('.tab ul.news li'):
        date = li.select_one('strong')
        pre = li.select_one('pre')
        if not date or not pre:
            continue

        date_str = date.text.strip()
        try:
            # NOTE(review): strptime() returns a naive datetime, so
            # timestamp() interprets it in the host's local time zone and
            # the value differs between machines. Confirm whether a fixed
            # zone should be applied.
            dt = datetime.strptime(date_str, "%Y.%m.%d")
            timestamp = int(dt.timestamp())
        except ValueError:
            timestamp = None

        headline = li.select_one('p.notice')
        headline_text = headline.text.strip() if headline else None

        # Drop purely presentational wrappers so their text reads as part of
        # the surrounding sentence.
        for tag in pre.select('font, b, u, span'):
            tag.unwrap()
        # unwrap() leaves the former children as separate, adjacent strings.
        # Without merging them, get_text() would still insert the separator
        # between e.g. "foo " and "bar" from "foo <b>bar</b>", which defeats
        # the unwrapping above.
        pre.smooth()
        content = pre.get_text(separator='\n', strip=True)

        images = []
        for img in pre.select('img'):
            # Prefer the lazy-load attribute over src when both are present.
            src = img.get('data-original') or img.get('src')
            if not src or src.startswith('data:'):
                continue  # nothing usable, or an inline data: URI
            src = urljoin(base_url, src)

            parent = img.find_parent('a')
            if parent and parent.has_attr('href'):
                href = urljoin(base_url, parent['href'])
            else:
                href = None

            image_entry = {'image': src, 'link': href}
            if image_entry not in images:
                images.append(image_entry)

        entries.append({
            'date': date_str,
            'identifier': 'SOUND_VOLTEX',
            'type': None,
            'timestamp': timestamp,
            'headline': headline_text,
            'content': content,
            "url": None,
            'images': images,
            'is_ai_summary': False,
        })

    return entries

def parse_nabla_news_site(html: str):
    """Parse a SOUND VOLTEX news page laid out as ``#news-inner ul.news li``.

    Every matched ``li`` that contains at least one ``strong`` produces one
    entry. The first ``strong`` is the date and the second, if present, is
    the headline. The body is assembled from the remaining direct children
    of the ``li``.

    Args:
        html: Raw HTML of the news page.

    Returns:
        A list of dicts with the keys ``date``, ``identifier``, ``type``,
        ``timestamp``, ``headline``, ``content``, ``url``, ``images`` and
        ``is_ai_summary``.

        * ``date`` is the stripped text of the first ``strong``.
        * ``timestamp`` is an ``int`` epoch value, or ``None`` when ``date``
          is not in ``YYYY.MM.DD`` form.
        * ``headline`` is the text of the second ``strong``, or ``None``.
        * ``content`` is the newline-joined text of the other children.
        * ``images`` is a de-duplicated list of
          ``{'image': <absolute url>, 'link': <absolute url or None>}``.
    """
    base_url = "https://p.eagate.573.jp"
    soup = BeautifulSoup(html, 'html.parser')

    entries = []
    for li in soup.select('#news-inner ul.news li'):
        strong_tags = li.select('strong')
        if not strong_tags:
            continue

        date_str = strong_tags[0].text.strip()
        try:
            # NOTE(review): strptime() returns a naive datetime, so
            # timestamp() interprets it in the host's local time zone and
            # the value differs between machines. Confirm whether a fixed
            # zone should be applied.
            dt = datetime.strptime(date_str, "%Y.%m.%d")
            timestamp = int(dt.timestamp())
        except ValueError:
            timestamp = None

        headline_text = None
        if len(strong_tags) > 1:
            headline_text = strong_tags[1].text.strip()

        # Drop purely presentational wrappers so their text becomes a direct
        # child of the <li> and is picked up by the loop below.
        for tag in li.select('font, b, u, span'):
            tag.unwrap()
        # unwrap() leaves the former children as separate, adjacent strings.
        # Merge them so one sentence is not split into several content parts.
        li.smooth()

        content_parts = []
        for node in li.contents:
            # Dispatch on node type. NavigableString exposes ``name`` (as
            # None), so a hasattr(node, 'name') test cannot tell strings
            # from tags.
            if isinstance(node, Comment):
                continue  # HTML comments are markup, not news text
            if isinstance(node, NavigableString):
                text = node.strip()
                # Do not repeat the date or headline inside the body.
                if text and text not in (date_str, headline_text):
                    content_parts.append(text)
            elif isinstance(node, Tag):
                if node.name == 'strong':
                    continue  # date/headline, already captured above
                if node.name == 'br':
                    # A line break is kept as its own part, so after the
                    # join below it leaves extra blank lines in the text.
                    content_parts.append('\n')
                elif node.name == 'a' and 'link-text' in node.get('class', []):
                    content_parts.append(node.text.strip())
                elif node.name != 'img':  # images are collected separately
                    content_parts.append(node.get_text(strip=True))

        content = '\n'.join(filter(None, content_parts)).strip()

        images = []
        for img in li.select('img'):
            # Prefer the lazy-load attribute over src when both are present.
            src = img.get('data-original') or img.get('src')
            # Skip missing or non-string values and inline data: URIs.
            if not isinstance(src, str) or not src or src.startswith('data:'):
                continue
            src = urljoin(base_url, src)

            href = None
            parent = img.find_parent('a')
            if parent is not None:
                href_val = parent.get('href')
                if isinstance(href_val, str) and href_val:
                    href = urljoin(base_url, href_val)

            image_entry = {'image': src, 'link': href}
            if image_entry not in images:
                images.append(image_entry)

        entries.append({
            'date': date_str,
            'identifier': 'SOUND_VOLTEX',
            'type': None,
            'timestamp': timestamp,
            'headline': headline_text,
            'content': content,
            "url": None,
            'images': images,
            'is_ai_summary': False,
        })

    return entries
send patches to the email below
yukais@pinapelz.com
include the subject [PATCH repo_name]
pinapelz.com
homepage