diff options
| author | Pinapelz <yukais@pinapelz.com> | 2025-10-02 21:38:13 -0700 |
|---|---|---|
| committer | Pinapelz <yukais@pinapelz.com> | 2025-10-02 21:39:14 -0700 |
| commit | 8d422f6224d5f02ac8a21428c951398a637e7e69 (patch) | |
| tree | 8539bbd702c0f0f2ebb9f7ff8b68c587aa298275 | |
| parent | 9b6920e633cfc98900d1cb817c8f59ab6b88af3a (diff) | |
fix: adjustments to CDATA processing for RSS feeds
| -rw-r--r-- | feed.py | 34 |
1 files changed, 21 insertions, 13 deletions
@@ -57,7 +57,7 @@ def build_rss_from_news_feed(title: str, description: str, json_file_path: str, if jp_content: desc_parts.append(jp_content.strip().replace("\n", "<br/>")) if en_headline or en_content: - desc_parts.append("<hr/><b>English Translation</b><br/>") + desc_parts.append("<br/>――――――――――――――――<br/><b>English Translation</b><br/>") if en_headline: desc_parts.append(f"<i>{en_headline.strip()}</i><br/>") if en_content: @@ -65,11 +65,9 @@ def build_rss_from_news_feed(title: str, description: str, json_file_path: str, desc_combined = "\n".join(desc_parts) - # Placeholder for CDATA desc_el = ET.SubElement(item, "description") - desc_el.text = f"__CDATA_PLACEHOLDER__{desc_combined}__END__" + desc_el.text = desc_combined - # pubDate if "timestamp" in post and post["timestamp"]: pub_date = datetime.fromtimestamp( post["timestamp"], timezone.utc @@ -91,15 +89,25 @@ def build_rss_from_news_feed(title: str, description: str, json_file_path: str, pass ET.SubElement(item, "enclosure", url=image_url, type=mime, length=length) - # Serialize XML - rough_xml = ET.tostring(rss, encoding="utf-8", xml_declaration=True) + # Convert to string for CDATA processing + xml_str = ET.tostring(rss, encoding="unicode", method="xml") - # Replace placeholders with real CDATA - final_xml = rough_xml.decode("utf-8").replace( - "__CDATA_PLACEHOLDER__", "<![CDATA[").replace("__END__", "]]>" - ) + # Process the XML string to wrap description content in CDATA + import re + + def replace_description(match): + content = match.group(1) + # Unescape the XML entities that were escaped by ET + content = content.replace('<', '<') + content = content.replace('>', '>') + content = content.replace('&', '&') + content = content.replace('"', '"') + content = content.replace(''', "'") + return '<description><![CDATA[' + content + ']]></description>' + xml_str = re.sub(r'<description>([^<]*)</description>', replace_description, xml_str) + dom = minidom.parseString(xml_str) + pretty_xml = dom.toprettyxml(indent=" ") + pretty_xml = '\n'.join([line for line in pretty_xml.split('\n') if line.strip()]) - # Pretty print - dom = minidom.parseString(final_xml) with open(output_path, "w", encoding="utf-8") as f: - f.write(dom.toprettyxml(indent=" ")) + f.write(pretty_xml) |
