From 8d4a3eeb3a68e39301caec1b2289783bd2bf7b6d Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Tue, 27 May 2025 15:54:20 -0700 Subject: wmmt: continue adding paragraphs until minimum word count is reached --- bandai_namco/wmmt.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/bandai_namco/wmmt.py b/bandai_namco/wmmt.py index fc0e4bf..b7ea927 100644 --- a/bandai_namco/wmmt.py +++ b/bandai_namco/wmmt.py @@ -144,8 +144,11 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER if paragraphs: content = paragraphs[0].get_text(" ", strip=True) if content and len(content.split()) < 50 and len(paragraphs) > 1: - next_p_content = paragraphs[1].get_text(" ", strip=True) - content += " " + next_p_content + for paragraph in paragraphs[1:]: + next_p_content = paragraph.get_text(" ", strip=True) + content += " " + next_p_content + if len(content.split()) >= 50: + break images = [] seen_srcs = [] for img in container.find_all("img"): @@ -178,8 +181,11 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER if paragraphs: content = paragraphs[0].get_text(" ", strip=True) if content and len(content.split()) < 50 and len(paragraphs) > 1: - next_p_content = paragraphs[1].get_text(" ", strip=True) - content += " " + next_p_content + for paragraph in paragraphs[1:]: + next_p_content = paragraph.get_text(" ", strip=True) + content += " " + next_p_content + if len(content.split()) >= 50: + break images = [] seen_srcs = [] for img in container.select("img"): @@ -214,8 +220,11 @@ def make_wmmt_news_extractor(identifier: str, version: constants.WANGAN_MAXI_VER if paragraphs: content = paragraphs[0].get_text(" ", strip=True) if content and len(content.split()) < 50 and len(paragraphs) > 1: - next_p_content = paragraphs[1].get_text(" ", strip=True) - content += " " + next_p_content + for paragraph in paragraphs[1:]: + next_p_content = paragraph.get_text(" ", strip=True) + content += " " + next_p_content 
+ if len(content.split()) >= 50: + break images = [] seen_srcs = [] for img in container.select("img"): -- cgit v1.2.3