#!/usr/bin/env python3
""" Quick and dirty script to copy comments from a WordPress backup.

    Drops the time part of the posted date as that's not useful for me, and
    flattens comments (Wordpress indents replies to previous comments). For me,
    date order happened to match the nested order, so flattening was fine, but
    if you had more comments, I imagine some comments might lose their context,
    making it unclear what they were replying to.
"""
from datetime import datetime
import glob
import os
import re
import unicodedata
import xml.etree.ElementTree as ET

# Directory that receives one '<slug>.comments.html' fragment per post.
outdir = 'theme/templates/comments/'

# Characters found in comment bodies that fall outside the plain character
# set; filled in while converting and printed at the end of the script.
uncommon_chars = set()

# Start from a clean slate: fragments are only (re)written for posts that
# have comments, so any other existing fragment would otherwise linger.
for stale_fragment in glob.glob(outdir + '*.comments.html'):
    os.remove(stale_fragment)

# Convert the comments of every published post in the WordPress export into
# one flat HTML fragment per post, written to outdir as '<slug>.comments.html'.
tree = ET.parse('shouldbesimple.WordPress.2025-07-07.xml')
ns = {'wp': 'http://wordpress.org/export/1.2/'}

# The trailing '/..' selects the parent of each matching <wp:post_type>,
# i.e. the elements whose post type is 'post'.
for item in tree.getroot().findall(".//wp:post_type[.='post']/..", ns):
    if item.find('wp:status', ns).text != 'publish':
        continue

    # Same parent trick: every element below this post that has a
    # <wp:comment_type> child whose text is not 'pingback'.
    comments = item.findall(".//wp:comment_type[.!='pingback']/..", ns)
    if not comments:
        # Posts without comments get no fragment file at all.
        continue

    # Flatten the thread: plain chronological order, reply nesting ignored.
    comments.sort(
        key=lambda c: datetime.fromisoformat(c.find('wp:comment_date', ns).text))

    slug = item.find('wp:post_name', ns).text
    # The slug becomes part of a file name, so refuse anything unexpected.
    # Raised explicitly (not asserted) so the check survives `python -O`.
    if not re.match(r'^[a-z\-]+$', slug or ''):
        raise ValueError(f'unexpected characters in post slug: {slug!r}')

    with open(outdir + slug + '.comments.html', 'w', encoding='utf-8') as f:
        for comment in comments:
            author = comment.find('wp:comment_author', ns).text or 'Anonymous'
            # The author is written into the HTML unescaped, so only a small
            # set of characters is accepted.
            if not re.match(r'^[a-zA-Z ()@]+$', author):
                raise ValueError(
                    f'unexpected characters in comment author: {author!r}')

            comment_id = int(comment.find('wp:comment_id', ns).text)

            # An empty <wp:comment_content/> parses to text=None; treat it as
            # an empty body instead of failing on None.replace().
            content = comment.find('wp:comment_content', ns).text or ''
            body = content.replace('\xa0', '&nbsp;').replace('\n', '<br>')

            # Remember anything outside the plain character set for the
            # report printed once all posts are processed.
            uncommon_chars |= set(re.findall(r'[^a-zA-Z0-9 .<>&;]', body))

            # I see no need to keep exact time of posting
            comment_date = datetime.fromisoformat(
                comment.find('wp:comment_date', ns).text).date()
            # Built from the date fields rather than strftime('%-d'): that
            # directive is a platform extension and is not portable.
            date_label = (
                f'{comment_date:%B} {comment_date.day}, {comment_date.year}')

            # Bodies that do not already start with markup get a paragraph.
            if not body.startswith('<'):
                body = f'<p>{body}</p>'

            f.write(
                f'<blockquote id="comment-{comment_id}">\n'
                f'<cite>By {author} <a href="#comment-{comment_id}">'
                f'<time datetime="{comment_date.isoformat()}"'
                f'>{date_label}</time></a></cite>\n'
                f'{body}\n</blockquote>\n\n'
            )

# Print each collected character (as a repr) next to its Unicode name, in
# sorted order.
for c in sorted(uncommon_chars):
    # unicodedata.name() raises ValueError('no such name') for characters
    # without a name; supplying that text as the default prints the same line
    # without a broad `except Exception` around the lookup and the print.
    print(repr(c), unicodedata.name(c, 'no such name'))