#!/usr/bin/env python3
"""Quick and dirty copy of comments from a WordPress backup.

Drops the time part of the posted date as that's not useful for me, and
flattens comments (WordPress indents replies to previous comments). For me,
date order happened to match the nested order, so flattening was fine, but
if you had more comments, I imagine some comments might lose their context,
making it unclear what they were replying to.
"""
from datetime import datetime
import glob
import os
import re
import unicodedata
import xml.etree.ElementTree as ET
uncommon_chars = set()
outdir = 'theme/templates/comments/'
for p in glob.glob(outdir + '*.comments.html'):
os.unlink(p)
tree = ET.parse('shouldbesimple.WordPress.2025-07-07.xml')
ns = {'wp': 'http://wordpress.org/export/1.2/'}
for item in tree.getroot().findall(".//wp:post_type[.='post']/..", ns):
if item.find('wp:status', ns).text == 'publish':
comments = item.findall(".//wp:comment_type[.!='pingback']/..", ns)
comments.sort(key=lambda c: datetime.fromisoformat(c.find('wp:comment_date', ns).text))
if comments:
slug = item.find('wp:post_name', ns).text
assert re.match(r'^[a-z\-]+$', slug), slug
with open(outdir + slug + '.comments.html', 'w', encoding='utf-8') as f:
for comment in comments:
author = comment.find('wp:comment_author', ns).text or 'Anonymous'
assert re.match(r'^[a-zA-Z ()@]+$', author), author
comment_id = int(comment.find('wp:comment_id', ns).text)
body = (comment.find('wp:comment_content', ns).text
.replace('\xa0', ' ')
.replace('\n', '
') )
uncommon_chars |= set(re.findall(r'[^a-zA-Z0-9 .<>&;]', body))
comment_date = datetime.fromisoformat(
# I see no need to keep exact time of posting
comment.find('wp:comment_date', ns).text).date()
if not body.startswith('<'):
body = '
%s
' % body f.write( f'\n' f'By {author} ' f'\n' f'{body}\n\n\n' ) for c in sorted(uncommon_chars): try: char_name = unicodedata.name(c) print(repr(c), char_name) except Exception as e: print(repr(c), e)