Skip to content

Instantly share code, notes, and snippets.

@Layoric
Created October 24, 2021 06:40
Show Gist options
  • Save Layoric/d82476a457e1db904a2601f65df53e3e to your computer and use it in GitHub Desktop.
Save Layoric/d82476a457e1db904a2601f65df53e3e to your computer and use it in GitHub Desktop.
Migrating Blogger to VuePress
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.Json;
using System.Text.RegularExpressions;
using ServiceStack;
namespace BloggerMigration
{
class Program
{
// Template for each generated Markdown page: a block delimited by "---" lines holding the
// title, date and an optional extra section, followed by the page body.
// Main fills it with four positional arguments:
//   {0} = post title, {1} = publish date formatted as yyyy-MM-dd,
//   {2} = the "tags:" list (empty string when the post has no labels), {3} = converted post content.
// The literal is a verbatim string, so its line breaks and column-0 layout are part of the output.
private static string mdTemplate = @"---
title: {0}
date: {1}
{2}
---
{3}";
/// <summary>
/// Migrates a Blogger blog to Markdown pages. Downloads every post through the Blogger v3 API,
/// mirrors blogspot-hosted images locally, converts the post HTML to Markdown, replaces embedded
/// Gist scripts with fenced code blocks and writes one <c>.md</c> file per post.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // The API can return the same post on more than one page, so keep the first occurrence per id.
    var uniquePosts = FetchAllPosts()
        .GroupBy(x => x.Id)
        .Select(x => x.First())
        .ToList();
    var converter = new ReverseMarkdown.Converter();
    foreach (var postItem in uniquePosts)
    {
        if (string.IsNullOrEmpty(postItem.Url))
        {
            // Both the image folder and the output file name are derived from the URL.
            Console.Error.WriteLine($"Skipping post '{postItem.Id}': it has no URL to derive a file name from.");
            continue;
        }

        var imgDirName = postItem.Url.Split("/").Last().Replace(".html", "");
        var content = postItem.Content ?? string.Empty;
        content = DownloadImages(content, imgDirName);
        // Update links
        content = RewriteImageAnchors(content, imgDirName);
        // HTML -> MarkDown
        content = converter.Convert(content);
        // Replace Gist embed scripts with code fence.
        content = InlineGists(content);
        postItem.Content = content;

        // Write modified MD
        // Title and labels are emitted as double-quoted YAML scalars so that characters such as
        // ':' or '#' cannot break the front matter. An empty label list produces no "tags:" key.
        var tags = postItem.Labels != null && postItem.Labels.Count > 0
            ? "tags:\n- " + string.Join("\n- ", postItem.Labels.Select(YamlQuote))
            : "";
        var mdOutput = string.Format(mdTemplate,
            YamlQuote(postItem.Title),
            // Invariant culture keeps the date Gregorian regardless of the machine's locale.
            postItem.Published.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture),
            tags,
            content);
        var fileName = postItem.Url.Split("/").Last().Replace(".html", ".md");
        var mdPath = ("..\\..\\..\\_archive\\" + fileName).MapAbsolutePath();
        // File.WriteAllText does not create missing directories.
        var mdDir = Path.GetDirectoryName(mdPath);
        if (!string.IsNullOrEmpty(mdDir))
        {
            Directory.CreateDirectory(mdDir);
        }
        File.WriteAllText(mdPath, mdOutput);
    }
}

/// <summary>Requests every page of the blog's post list and returns all posts in page order.</summary>
private static List<PostItem> FetchAllPosts()
{
    const string postsUrl = "https://www.googleapis.com/blogger/v3/blogs/<YourBlogId>/posts?key=<APIKEY>";
    var posts = new List<PostItem>();
    var page = postsUrl.GetJsonFromUrl().FromJson<BloggerPostsResponse>();
    while (page != null)
    {
        // A page without posts may deserialize with Items == null.
        if (page.Items != null)
        {
            posts.AddRange(page.Items);
        }
        if (string.IsNullOrEmpty(page.NextPageToken))
        {
            break;
        }
        page = (postsUrl + "&pageToken=" + Uri.EscapeDataString(page.NextPageToken))
            .GetJsonFromUrl()
            .FromJson<BloggerPostsResponse>();
    }
    return posts;
}

/// <summary>
/// Downloads each blogspot-hosted image referenced by an <c>&lt;img&gt;</c> tag into the post's
/// image folder and returns the content with those image URLs pointing at the local copies.
/// </summary>
private static string DownloadImages(string content, string imgDirName)
{
    // Expected URL shape: scheme://host/a/b/c/d/<scale>/<file>, which splits into 9 segments.
    const int scaleSegment = 7;
    const int nameSegment = 8;
    var images = GetImageLinks(content)
        .Where(x => x.Contains("blogspot.com"))
        .Distinct()
        .ToList();
    var imgDirPath = ($"..\\..\\..\\images\\archive\\{imgDirName}\\").MapAbsolutePath();
    foreach (var img in images)
    {
        var segments = img.Split("/");
        if (segments.Length <= nameSegment)
        {
            Console.Error.WriteLine($"Leaving image untouched (unexpected URL shape): {img}");
            continue;
        }
        var scale = segments[scaleSegment];
        var imgName = segments[nameSegment];
        // Swap the size segment for /s10000/ before downloading.
        // NOTE(review): presumably this requests the largest available rendition - confirm.
        var bytes = img.Replace($"/{scale}/", "/s10000/").GetBytesFromUrl();
        // Idempotent; only reached when there is something to store.
        Directory.CreateDirectory(imgDirPath);
        File.WriteAllBytes((Path.Join(imgDirPath, imgName)).MapAbsolutePath(), bytes);
        content = content.Replace(img, $"/images/archive/{imgDirName}/{imgName}");
    }
    return content;
}

/// <summary>
/// Points blogspot anchors that have the 9-segment image URL shape at the post's local image
/// folder. Other blogspot links, such as links to another post, are left unchanged.
/// </summary>
private static string RewriteImageAnchors(string content, string imgDirName)
{
    const int nameSegment = 8;
    var anchorLinks = GetAnchorLinks(content)
        .Where(x => x.Contains("blogspot.com"))
        .Distinct()
        .ToList();
    foreach (var link in anchorLinks)
    {
        var segments = link.Split("/");
        if (segments.Length <= nameSegment)
        {
            continue;
        }
        content = content.Replace(link, $"/images/archive/{imgDirName}/{segments[nameSegment]}");
    }
    return content;
}

/// <summary>
/// Replaces every <c>&lt;script src="...gist.github.com..."&gt;</c> embed with fenced code blocks,
/// one per file of the gist, fetched from the GitHub API.
/// </summary>
private static string InlineGists(string content)
{
    var scripts = GetScriptUrls(content)
        .Where(x => x.Contains("gist.github.com"))
        .Distinct()
        .ToList();
    foreach (var script in scripts)
    {
        // The gist id is the last path segment without an optional query string and ".js" suffix.
        var gistId = script.Split("/").Last();
        var queryStart = gistId.IndexOf('?');
        if (queryStart >= 0)
        {
            gistId = gistId.Substring(0, queryStart);
        }
        if (gistId.EndsWith(".js", StringComparison.OrdinalIgnoreCase))
        {
            gistId = gistId.Substring(0, gistId.Length - 3);
        }
        if (gistId.Length == 0)
        {
            continue;
        }

        var gistJson = ("https://api.github.com/gists/" + gistId).GetJsonFromUrl(request =>
        {
            request.Headers.Add("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0");
        });

        var fences = new List<string>();
        // JsonDocument rents pooled buffers and must be disposed.
        using (var gistDoc = JsonDocument.Parse(gistJson))
        {
            JsonElement files;
            if (gistDoc.RootElement.TryGetProperty("files", out files) && files.ValueKind == JsonValueKind.Object)
            {
                foreach (var file in files.EnumerateObject())
                {
                    var gistFile = file.Value.ToString().FromJson<GistFile>();
                    var code = gistFile == null ? null : gistFile.content;
                    fences.Add($@"
```
{code}
```
");
                }
            }
        }
        if (fences.Count == 0)
        {
            // Nothing to inline; keep the original embed rather than deleting it.
            continue;
        }
        content = content.Replace($"<script src=\"{script}\"></script>", string.Concat(fences));
    }
    return content;
}

/// <summary>Renders a value as a double-quoted YAML scalar, escaping backslashes and quotes.</summary>
private static string YamlQuote(string value)
{
    return "\"" + (value ?? string.Empty).Replace("\\", "\\\\").Replace("\"", "\\\"") + "\"";
}
/// <summary>
/// Lazily yields the <c>src</c> attribute value of each <c>&lt;script&gt;</c> tag matched in the
/// given markup (case-insensitive), in the order the tags appear.
/// </summary>
/// <param name="inputHTML">Markup to scan.</param>
/// <returns>The captured <c>src</c> values.</returns>
static IEnumerable<String> GetScriptUrls(String inputHTML)
{
    const string scriptSrcPattern = @"<script\b[^\<\>]+?\bsrc\s*=\s*[""'](?<L>.+?)[""'][^\<\>]*?\>";
    // Walk the matches one at a time instead of enumerating a MatchCollection.
    var current = Regex.Match(inputHTML, scriptSrcPattern, RegexOptions.IgnoreCase);
    while (current.Success)
    {
        yield return current.Groups["L"].Value;
        current = current.NextMatch();
    }
}
/// <summary>
/// Lazily yields the <c>src</c> attribute value of each <c>&lt;img&gt;</c> tag matched in the
/// given markup (case-insensitive), in the order the tags appear.
/// </summary>
/// <param name="inputHTML">Markup to scan.</param>
/// <returns>The captured <c>src</c> values.</returns>
static IEnumerable<String> GetImageLinks(String inputHTML)
{
    const string imgSrcPattern = @"<img\b[^\<\>]+?\bsrc\s*=\s*[""'](?<L>.+?)[""'][^\<\>]*?\>";
    // Walk the matches one at a time instead of enumerating a MatchCollection.
    var current = Regex.Match(inputHTML, imgSrcPattern, RegexOptions.IgnoreCase);
    while (current.Success)
    {
        yield return current.Groups["L"].Value;
        current = current.NextMatch();
    }
}
/// <summary>
/// Lazily yields the <c>href</c> attribute value of each <c>&lt;a&gt;</c> tag matched in the
/// given markup (case-insensitive), in the order the tags appear.
/// </summary>
/// <param name="inputHTML">Markup to scan.</param>
/// <returns>The captured <c>href</c> values.</returns>
static IEnumerable<String> GetAnchorLinks(String inputHTML)
{
    const string anchorHrefPattern = @"<a\b[^\<\>]+?\bhref\s*=\s*[""'](?<L>.+?)[""'][^\<\>]*?\>";
    // Walk the matches one at a time instead of enumerating a MatchCollection.
    var current = Regex.Match(inputHTML, anchorHrefPattern, RegexOptions.IgnoreCase);
    while (current.Success)
    {
        yield return current.Groups["L"].Value;
        current = current.NextMatch();
    }
}
}
/// <summary>
/// One page of the Blogger v3 post list, as deserialized from the JSON returned by the
/// <c>/posts</c> endpoint.
/// </summary>
public class BloggerPostsResponse
{
    /// <summary>Token identifying the next page; null when there are no further pages.</summary>
    public string NextPageToken { get; set; }

    /// <summary>
    /// Posts contained in this page. Starts out as an empty list so that consumers calling
    /// <c>AddRange</c> or enumerating it do not hit a null reference.
    /// NOTE(review): assumes the deserializer leaves this untouched when the payload has no
    /// "items" member - confirm.
    /// </summary>
    public List<PostItem> Items { get; set; } = new List<PostItem>();
}
/// <summary>
/// A single blog post as deserialized from the Blogger posts response. Only the members read
/// by the migration are declared.
/// </summary>
public class PostItem
{
// Post identifier; Main groups posts by this value to drop duplicates.
public string Id { get; set; }
// Publication timestamp; written to the page header as yyyy-MM-dd.
public DateTime Published { get; set; }
// Public post URL; its last path segment (minus ".html") names the image folder and the .md file.
public string Url { get; set; }
// Post title, written to the page header.
public string Title { get; set; }
// Post body. Arrives as HTML and is rewritten in place by Main (image/link rewriting, Markdown conversion).
public string Content { get; set; }
// Labels attached to the post; may be null, in which case no "tags:" section is emitted.
public List<string> Labels { get; set; }
}
/// <summary>
/// One entry of the "files" object in a GitHub gist API response. Member names are lower-case
/// and snake_case, presumably to mirror the JSON keys - keep them as they are.
/// </summary>
public class GistFile
{
// Declared for deserialization; not read elsewhere in this file.
public string filename { get; set; }
// Declared for deserialization; not read elsewhere in this file.
public string type { get; set; }
// File text; this is what Main places inside the fenced code block that replaces the embed script.
public string content { get; set; }
// Declared for deserialization; not read elsewhere in this file.
public string raw_url { get; set; }
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment