samstokes/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Script to parse Disqus Comment Export XML as documented at
https://help.disqus.com/en/articles/1717164-comments-export
and translate it into a JSON format that is easier to handle (for example, using Jekyll's built-in support for JSON files in _data).
The imagined use case is if you have decided to remove Disqus comments from your blog (or maybe disable commenting entirely),
but have existing comments on previous posts that you want to keep. You can export your comments from Disqus, and statically render the historical comments,
without needing any Disqus Javascript or cookies.
The Disqus XML exports all categories, threads and posts into flat lists, with id references for parent relationships.
That makes it complicated to do things like "look up all comments on this blog post", or see which comments were in reply to which,
so this also reassembles the tree structure for easier usage.
This includes some bare-bones example HTML to demonstrate how to use the data. You will almost certainly want to edit the HTML and add some styling.
Feel free to reuse this for your own blog. This could be packaged as a Jekyll generator gem if anyone is interested.

  
## _comments.html
{%- comment -%}
Renders a list of comments as a nested <ul>, leaving out entries whose isSpam or isDeleted
field is not 'false'.

This is separate from disqus_comments.html because Disqus replies can be arbitrarily nested
(you can have replies to replies to replies, etc), so this partial needs to include itself
recursively.

Parameters:
  comments - list of post objects; each one is rendered via its `message` field and its
             `replies` are rendered by including this partial again.

The list is only emitted when at least one comment survives the filtering, so a comment
without replies does not end up with an empty <ul></ul> inside it.
{%- endcomment -%}

{% assign good_comments = include.comments | where: 'isSpam', 'false' | where: 'isDeleted', 'false' %}
{% if good_comments and good_comments.size > 0 %}
<ul>
  {% for comment in good_comments %}
  <li>
    {{ comment.message }}
    {% include _comments.html comments=comment.replies %}
  </li>
  {% endfor %}
</ul>
{% endif %}

## disqus_comments.html
{%- comment -%}
Renders the exported Disqus comments that belong to the current page.

Categories are filtered down to the forum named by site.disqus.shortname. For each of the
page's redirect_from links, every thread whose link contains that old link is rendered
through _comments.html.

The data is read from site.data.disqus_export, i.e. _data/disqus_export.json, which is the
file name used in the example usage of disqus_xml_to_json.rb.
{%- endcomment -%}
{% assign relevant_categories = site.data.disqus_export.categories | where:'forum', site.disqus.shortname %}
{% for category in relevant_categories %}
  <h2>Comments ({{category.title}})</h2>
  {% for old_link in page.redirect_from %}
    {% assign relevant_threads = category.threads | where_exp:'thread', 'thread.link contains old_link' %}
    {% for thread in relevant_threads %}
      <h3>{{thread.id}} ({{thread.link}})</h3>
      {% include _comments.html comments=thread.rootPosts %}
    {% endfor %}
  {% endfor %}
{% endfor %}

## disqus_xml_to_json.rb
# frozen_string_literal: true

# Parses Disqus Comment Export XML as documented at
# https://help.disqus.com/en/articles/1717164-comments-export
#
# Also reassembles the exported data into a tree structure for easier usage.
#
# Accepts .xml.gz input filename as first command-line argument, and prints
# JSON to standard output.
#
# Example usage:
# ruby ./_scripts/disqus_xml_to_json.rb /path/to/disqus-export.xml.gz > _data/disqus_export.json
#
# Then you should be able to access the categories, threads and posts in a post via `site.data.disqus_export.categories`.

require 'json'
require 'zlib'
require 'nokogiri'

# Converts an XML element into plain Ruby data that can be serialised as JSON.
#
# * No child nodes: returns a Hash of the element's attributes (possibly empty).
# * Only text/CDATA child nodes: returns the element's text, or, when the
#   element also has attributes, the attributes plus a 'text' entry.
# * Otherwise: returns the attributes merged with one entry per child element,
#   keyed by the child's name and converted recursively. A later child with the
#   same name replaces an earlier one.
#
# @param elem [Nokogiri::XML::Element] element to convert
# @return [Hash, String] converted representation
def elem_to_o(elem)
  attrs = elem.attributes.map { |_, a| attr_pair(a) }.to_h
  nodes = elem.children

  return attrs if nodes.empty?

  # The block parameter is deliberately not called `elem`, so it does not
  # shadow the method argument.
  if nodes.all? { |node| node.text? || node.cdata? }
    return attrs.empty? ? elem.text : attrs.merge('text' => elem.text)
  end

  child_attrs = elem.element_children.map { |child| [child.name, elem_to_o(child)] }
  attrs.merge(child_attrs.to_h)
end

# Builds the [name, value] pair for an XML attribute. When the attribute has a
# namespace with a prefix, the name is qualified as "prefix:name" (for example
# "dsq:id"); otherwise the bare attribute name is used.
#
# @param attr [#namespace, #name, #value] attribute node
# @return [Array(String, String)] two-element array of name and value
def attr_pair(attr)
  prefix = attr&.namespace&.prefix
  qualified_name = prefix ? "#{prefix}:#{attr.name}" : attr.name
  [qualified_name, attr.value]
end

# Rebuilds the reply tree from a flat list of posts and returns the top-level
# posts.
#
# A post that has a 'parent' entry is appended, in input order, to its
# parent's 'replies' array (created on first use); all other posts are
# returned as roots. The post hashes are modified in place.
#
# The Disqus docs specify "Parents should always exist before they are
# referenced", but all posts are indexed up front here so that linking does
# not depend on that ordering.
#
# @param posts [Array<Hash>] post hashes, each with a 'dsq:id' entry
# @return [Array<Hash>] the posts without a parent, in input order
# @raise [KeyError] if a post or parent reference lacks 'dsq:id', or a parent
#   id does not match any post
def reconstruct_tree!(posts)
  by_id = posts.each_with_object({}) { |post, index| index[post.fetch('dsq:id')] = post }

  replies, roots = posts.partition { |post| post['parent'] }

  replies.each do |reply|
    parent_id = reply.fetch('parent').fetch('dsq:id')
    parent_post = by_id.fetch(parent_id) do
      raise KeyError, "post #{reply.fetch('dsq:id')} references unknown parent #{parent_id}"
    end

    (parent_post['replies'] ||= []) << reply
  end

  roots
end

# Attaches each child to the parent it refers to and returns the parents.
#
# A child names its parent through child[parent_attr]['dsq:id']. The child is
# appended to the array stored under parent[children_attr], which is created
# on first use. Parent hashes are modified in place.
#
# @param parents [Array<Hash>] hashes with a unique 'dsq:id'
# @param children_attr [String] key under which children are collected
# @param parent_attr [String] key in each child holding the parent reference
# @param children [Array<Hash>] hashes to distribute among the parents
# @return [Array<Hash>] the same parents array
# @raise [RuntimeError] if two parents share a 'dsq:id'
# @raise [KeyError] if a required key or the referenced parent is missing
def embed_children!(parents, children_attr, parent_attr, children)
  grouped = parents.group_by { |candidate| candidate.fetch('dsq:id') }

  # Collapse each group to its single member, rejecting duplicate ids.
  index = grouped.each_with_object({}) do |(id, matches), lookup|
    raise "more than one #{parent_attr} for id #{id}" if matches.size > 1

    lookup[id] = matches.first
  end

  children.each do |child|
    owner = index.fetch(child.fetch(parent_attr).fetch('dsq:id'))
    (owner[children_attr] ||= []) << child
  end

  parents
end

# Script entry point: reads the gzipped XML export named by the first
# command-line argument and prints the reassembled data as JSON on stdout.
GZ_FILE = ARGV[0] || raise('Please specify .xml.gz input as first argument.')

Zlib::GzipReader.open(GZ_FILE) do |xml|
  doc = Nokogiri::XML::Document.parse(xml)

  first_level_elements = doc.root.element_children.group_by(&:name)

  # An export may lack one kind of element entirely (for instance no posts);
  # treat a missing group as empty rather than calling map on nil.
  categories, threads, posts = %w[category thread post].map do |name|
    first_level_elements.fetch(name, []).map { |elem| elem_to_o(elem) }
  end

  # Nest replies under their parent posts first, then hang the root posts on
  # their threads and the threads on their categories.
  root_posts = reconstruct_tree!(posts)

  embed_children!(categories, 'threads', 'category', threads)
  embed_children!(threads, 'rootPosts', 'thread', root_posts)

  puts JSON.generate('categories' => categories)
end
	{%- comment -%}
	Renders a list of comments as a nested <ul>, leaving out entries whose isSpam or isDeleted
	field is not 'false'.

	This is separate from disqus_comments.html because Disqus replies can be arbitrarily nested
	(you can have replies to replies to replies, etc), so this partial needs to include itself
	recursively.

	Parameters:
	  comments - list of post objects; each one is rendered via its `message` field and its
	             `replies` are rendered by including this partial again.

	The list is only emitted when at least one comment survives the filtering, so a comment
	without replies does not end up with an empty <ul></ul> inside it.
	{%- endcomment -%}

	{% assign good_comments = include.comments | where: 'isSpam', 'false' | where: 'isDeleted', 'false' %}
	{% if good_comments and good_comments.size > 0 %}
	<ul>
	  {% for comment in good_comments %}
	  <li>
	    {{ comment.message }}
	    {% include _comments.html comments=comment.replies %}
	  </li>
	  {% endfor %}
	</ul>
	{% endif %}
	{%- comment -%}
	Renders the exported Disqus comments that belong to the current page.

	Categories are filtered down to the forum named by site.disqus.shortname. For each of the
	page's redirect_from links, every thread whose link contains that old link is rendered
	through _comments.html.

	The data is read from site.data.disqus_export, i.e. _data/disqus_export.json, which is the
	file name used in the example usage of disqus_xml_to_json.rb.
	{%- endcomment -%}
	{% assign relevant_categories = site.data.disqus_export.categories | where:'forum', site.disqus.shortname %}
	{% for category in relevant_categories %}
	  <h2>Comments ({{category.title}})</h2>
	  {% for old_link in page.redirect_from %}
	    {% assign relevant_threads = category.threads | where_exp:'thread', 'thread.link contains old_link' %}
	    {% for thread in relevant_threads %}
	      <h3>{{thread.id}} ({{thread.link}})</h3>
	      {% include _comments.html comments=thread.rootPosts %}
	    {% endfor %}
	  {% endfor %}
	{% endfor %}
	# frozen_string_literal: true

	# Parses Disqus Comment Export XML as documented at
	# https://help.disqus.com/en/articles/1717164-comments-export
	#
	# Also reassembles the exported data into a tree structure for easier usage.
	#
	# Accepts .xml.gz input filename as first command-line argument, and prints
	# JSON to standard output.
	#
	# Example usage:
	# ruby ./_scripts/disqus_xml_to_json.rb /path/to/disqus-export.xml.gz > _data/disqus_export.json
	#
	# Then you should be able to access the categories, threads and posts in a post via `site.data.disqus_export.categories`.

	require 'json'
	require 'zlib'
	require 'nokogiri'

	# Converts an XML element into plain Ruby data that can be serialised as JSON.
	#
	# * No child nodes: returns a Hash of the element's attributes (possibly empty).
	# * Only text/CDATA child nodes: returns the element's text, or, when the
	#   element also has attributes, the attributes plus a 'text' entry.
	# * Otherwise: returns the attributes merged with one entry per child element,
	#   keyed by the child's name and converted recursively. A later child with the
	#   same name replaces an earlier one.
	#
	# @param elem [Nokogiri::XML::Element] element to convert
	# @return [Hash, String] converted representation
	def elem_to_o(elem)
	  attrs = elem.attributes.map { |_, a| attr_pair(a) }.to_h
	  nodes = elem.children

	  return attrs if nodes.empty?

	  # The block parameter is deliberately not called `elem`, so it does not
	  # shadow the method argument.
	  if nodes.all? { |node| node.text? || node.cdata? }
	    return attrs.empty? ? elem.text : attrs.merge('text' => elem.text)
	  end

	  child_attrs = elem.element_children.map { |child| [child.name, elem_to_o(child)] }
	  attrs.merge(child_attrs.to_h)
	end

	# Builds the [name, value] pair for an XML attribute. When the attribute has a
	# namespace with a prefix, the name is qualified as "prefix:name" (for example
	# "dsq:id"); otherwise the bare attribute name is used.
	#
	# @param attr [#namespace, #name, #value] attribute node
	# @return [Array(String, String)] two-element array of name and value
	def attr_pair(attr)
	  prefix = attr&.namespace&.prefix
	  qualified_name = prefix ? "#{prefix}:#{attr.name}" : attr.name
	  [qualified_name, attr.value]
	end

	# Rebuilds the reply tree from a flat list of posts and returns the top-level
	# posts.
	#
	# A post that has a 'parent' entry is appended, in input order, to its
	# parent's 'replies' array (created on first use); all other posts are
	# returned as roots. The post hashes are modified in place.
	#
	# The Disqus docs specify "Parents should always exist before they are
	# referenced", but all posts are indexed up front here so that linking does
	# not depend on that ordering.
	#
	# @param posts [Array<Hash>] post hashes, each with a 'dsq:id' entry
	# @return [Array<Hash>] the posts without a parent, in input order
	# @raise [KeyError] if a post or parent reference lacks 'dsq:id', or a parent
	#   id does not match any post
	def reconstruct_tree!(posts)
	  by_id = posts.each_with_object({}) { |post, index| index[post.fetch('dsq:id')] = post }

	  replies, roots = posts.partition { |post| post['parent'] }

	  replies.each do |reply|
	    parent_id = reply.fetch('parent').fetch('dsq:id')
	    parent_post = by_id.fetch(parent_id) do
	      raise KeyError, "post #{reply.fetch('dsq:id')} references unknown parent #{parent_id}"
	    end

	    (parent_post['replies'] ||= []) << reply
	  end

	  roots
	end

	# Attaches each child to the parent it refers to and returns the parents.
	#
	# A child names its parent through child[parent_attr]['dsq:id']. The child is
	# appended to the array stored under parent[children_attr], which is created
	# on first use. Parent hashes are modified in place.
	#
	# @param parents [Array<Hash>] hashes with a unique 'dsq:id'
	# @param children_attr [String] key under which children are collected
	# @param parent_attr [String] key in each child holding the parent reference
	# @param children [Array<Hash>] hashes to distribute among the parents
	# @return [Array<Hash>] the same parents array
	# @raise [RuntimeError] if two parents share a 'dsq:id'
	# @raise [KeyError] if a required key or the referenced parent is missing
	def embed_children!(parents, children_attr, parent_attr, children)
	  grouped = parents.group_by { |candidate| candidate.fetch('dsq:id') }

	  # Collapse each group to its single member, rejecting duplicate ids.
	  index = grouped.each_with_object({}) do |(id, matches), lookup|
	    raise "more than one #{parent_attr} for id #{id}" if matches.size > 1

	    lookup[id] = matches.first
	  end

	  children.each do |child|
	    owner = index.fetch(child.fetch(parent_attr).fetch('dsq:id'))
	    (owner[children_attr] ||= []) << child
	  end

	  parents
	end

	# Script entry point: reads the gzipped XML export named by the first
	# command-line argument and prints the reassembled data as JSON on stdout.
	GZ_FILE = ARGV[0] || raise('Please specify .xml.gz input as first argument.')

	Zlib::GzipReader.open(GZ_FILE) do |xml|
	  doc = Nokogiri::XML::Document.parse(xml)

	  first_level_elements = doc.root.element_children.group_by(&:name)

	  # An export may lack one kind of element entirely (for instance no posts);
	  # treat a missing group as empty rather than calling map on nil.
	  categories, threads, posts = %w[category thread post].map do |name|
	    first_level_elements.fetch(name, []).map { |elem| elem_to_o(elem) }
	  end

	  # Nest replies under their parent posts first, then hang the root posts on
	  # their threads and the threads on their categories.
	  root_posts = reconstruct_tree!(posts)

	  embed_children!(categories, 'threads', 'category', threads)
	  embed_children!(threads, 'rootPosts', 'thread', root_posts)

	  puts JSON.generate('categories' => categories)
	end