From 8d422f6224d5f02ac8a21428c951398a637e7e69 Mon Sep 17 00:00:00 2001 From: Pinapelz Date: Thu, 2 Oct 2025 21:38:13 -0700 Subject: fix: adjustments to CDATA processing for RSS feeds --- feed.py | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) (limited to 'feed.py') diff --git a/feed.py b/feed.py index 6a6d0ef..617be3e 100644 --- a/feed.py +++ b/feed.py @@ -57,7 +57,7 @@ def build_rss_from_news_feed(title: str, description: str, json_file_path: str, if jp_content: desc_parts.append(jp_content.strip().replace("\n", "
")) if en_headline or en_content: - desc_parts.append("
English Translation
") + desc_parts.append("
――――――――――――――――
English Translation
") if en_headline: desc_parts.append(f"{en_headline.strip()}
") if en_content: @@ -65,11 +65,9 @@ def build_rss_from_news_feed(title: str, description: str, json_file_path: str, desc_combined = "\n".join(desc_parts) - # Placeholder for CDATA desc_el = ET.SubElement(item, "description") - desc_el.text = f"__CDATA_PLACEHOLDER__{desc_combined}__END__" + desc_el.text = desc_combined - # pubDate if "timestamp" in post and post["timestamp"]: pub_date = datetime.fromtimestamp( post["timestamp"], timezone.utc @@ -91,15 +89,25 @@ def build_rss_from_news_feed(title: str, description: str, json_file_path: str, pass ET.SubElement(item, "enclosure", url=image_url, type=mime, length=length) - # Serialize XML - rough_xml = ET.tostring(rss, encoding="utf-8", xml_declaration=True) + # Convert to string for CDATA processing + xml_str = ET.tostring(rss, encoding="unicode", method="xml") + + # Process the XML string to wrap description content in CDATA + import re + + def replace_description(match): + content = match.group(1) + # Unescape the XML entities that were escaped by ET + content = content.replace('<', '<') + content = content.replace('>', '>') + content = content.replace('&', '&') + content = content.replace('"', '"') + content = content.replace(''', "'") + return '' + xml_str = re.sub(r'([^<]*)', replace_description, xml_str) + dom = minidom.parseString(xml_str) + pretty_xml = dom.toprettyxml(indent=" ") + pretty_xml = '\n'.join([line for line in pretty_xml.split('\n') if line.strip()]) - # Replace placeholders with real CDATA - final_xml = rough_xml.decode("utf-8").replace( - "__CDATA_PLACEHOLDER__", "" - ) - - # Pretty print - dom = minidom.parseString(final_xml) with open(output_path, "w", encoding="utf-8") as f: - f.write(dom.toprettyxml(indent=" ")) + f.write(pretty_xml) -- cgit v1.2.3