#!/usr/bin/env python3
"""
Quick and dirty copy comments from Wordpress backup.

Drops the time part of the posted date as that's not useful for me, and
flattens comments (Wordpress indents replies to previous comments). For me,
date order happened to match the nested order, so flattening was fine, but if
you had more comments, I imagine some comments might lose their context,
making it unclear what they were replying to.
"""

from datetime import datetime
import glob
import os
import re
import unicodedata
import xml.etree.ElementTree as ET

# Every character found in a comment body that is outside the "plain" set
# matched by UNCOMMON_CHAR_RE. Printed with its Unicode name at the end of the
# run so unusual characters can be checked by eye.
uncommon_chars = set()

outdir = 'theme/templates/comments/'
export_file = 'shouldbesimple.WordPress.2025-07-07.xml'
ns = {'wp': 'http://wordpress.org/export/1.2/'}

# The slug becomes part of a file name and the author is written into HTML
# without escaping, so both are restricted to a known-safe alphabet. These are
# used with fullmatch(), so no anchors are needed.
SLUG_RE = re.compile(r'[a-z\-]+')
AUTHOR_RE = re.compile(r'[a-zA-Z ()@]+')
UNCOMMON_CHAR_RE = re.compile(r'[^a-zA-Z0-9 .<>&;]')


def posted_at(comment):
    """Return the comment's ``wp:comment_date`` parsed as a datetime."""
    return datetime.fromisoformat(
        comment.findtext('wp:comment_date', namespaces=ns))


def normalise_body(raw):
    """Return comment text with NBSPs as spaces and newlines as line breaks.

    ``raw`` may be None or empty (a comment with no content); that yields ''.
    """
    # NOTE(review): the replacement for '\n' was lost when the markup in this
    # file was stripped; '<br>' is a reconstruction -- confirm.
    return (raw or '').replace('\xa0', ' ').replace('\n', '<br>')


def find_uncommon_chars(body):
    """Return the set of characters in *body* outside the plain set."""
    return set(UNCOMMON_CHAR_RE.findall(body))


def wrap_body(body):
    """Wrap *body* in a paragraph unless it already starts with a tag."""
    if body.startswith('<'):
        return body
    # NOTE(review): wrapper tags reconstructed as <p>...</p> -- confirm.
    return '<p>%s</p>' % body


def render_comment(author, comment_id, comment_date, body):
    """Return the HTML fragment written to the output file for one comment."""
    # NOTE(review): the tags of this template were lost when the markup in
    # this file was stripped. Only the literal text ("By {author} ", the
    # newlines and "{body}") is original; the elements and attributes below
    # are a reconstruction -- confirm against the theme's CSS/templates.
    return (
        f'<div class="comment" id="comment-{comment_id}">\n'
        f'<strong>By {author}</strong> '
        f'<time datetime="{comment_date}">{comment_date}</time>\n'
        f'{body}\n</div>\n\n'
    )


def convert_comment(comment):
    """Convert one ``wp:comment`` element to HTML.

    Returns ``(html, odd_chars)`` where ``odd_chars`` is the set of uncommon
    characters found in the comment body.

    Raises ValueError if the author name contains characters outside the
    allowed set.
    """
    author = comment.findtext('wp:comment_author', namespaces=ns) or 'Anonymous'
    if not AUTHOR_RE.fullmatch(author):
        # Explicit check rather than assert: the author is emitted unescaped,
        # so this guard must not disappear under ``python -O``.
        raise ValueError(f'unexpected characters in comment author: {author!r}')
    comment_id = int(comment.findtext('wp:comment_id', namespaces=ns))
    body = normalise_body(
        comment.findtext('wp:comment_content', namespaces=ns))
    # Collected before the paragraph wrapper is added, so the wrapper's own
    # characters are never reported.
    odd_chars = find_uncommon_chars(body)
    # I see no need to keep exact time of posting
    comment_date = posted_at(comment).date()
    html = render_comment(author, comment_id, comment_date, wrap_body(body))
    return html, odd_chars


def export_post_comments(item, out_dir=None):
    """Write ``<slug>.comments.html`` for one published post, if it has comments.

    Pingbacks are skipped and the remaining comments are written in date
    order. Uncommon characters found in the bodies are added to the
    module-level ``uncommon_chars`` set.

    Raises ValueError if the post slug or a comment author contains
    characters outside the allowed sets.
    """
    if out_dir is None:
        out_dir = outdir
    if item.findtext('wp:status', default='', namespaces=ns) != 'publish':
        return
    comments = item.findall(".//wp:comment_type[.!='pingback']/..", ns)
    if not comments:
        return
    comments.sort(key=posted_at)

    slug = item.findtext('wp:post_name', default='', namespaces=ns)
    if not SLUG_RE.fullmatch(slug):
        # The slug is used as a file name; refuse anything unexpected.
        raise ValueError(f'unexpected characters in post slug: {slug!r}')

    # Render everything before opening the file so that a rejected comment
    # does not leave a truncated output file behind.
    fragments = []
    for comment in comments:
        html, odd_chars = convert_comment(comment)
        fragments.append(html)
        uncommon_chars.update(odd_chars)

    path = os.path.join(out_dir, slug + '.comments.html')
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(fragments)


def describe_char(c):
    """Return ``repr(c)`` followed by its Unicode name, or the lookup error."""
    try:
        name = unicodedata.name(c)
    except ValueError as e:
        # Raised for characters that have no name (e.g. control characters).
        name = e
    return f'{c!r} {name}'


def main():
    """Regenerate all comment files from the export and report odd characters."""
    # Start from a clean slate so posts whose comments were removed do not
    # keep a stale file.
    for stale in glob.glob(os.path.join(outdir, '*.comments.html')):
        os.unlink(stale)

    root = ET.parse(export_file).getroot()
    for item in root.findall(".//wp:post_type[.='post']/..", ns):
        export_post_comments(item)

    for c in sorted(uncommon_chars):
        print(describe_char(c))


if __name__ == '__main__':
    main()