-
-
Save Layoric/d82476a457e1db904a2601f65df53e3e to your computer and use it in GitHub Desktop.
Migrating blogger to Vuepress
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Text.Json;
using System.Text.RegularExpressions;
using ServiceStack;
namespace BloggerMigration | |
{ | |
class Program | |
{ | |
// Layout of every generated Markdown file: YAML front matter followed by the body.
// Placeholders, in the order Main supplies them:
//   {0} title, {1} publish date, {2} optional "tags:" block, {3} post body.
// The literal must stay flush-left: leading whitespace inside a verbatim string
// would end up in the output files.
private static readonly string mdTemplate = @"---
title: {0}
date: {1}
{2}
---
{3}";
/// <summary>
/// Exports every post of a Blogger blog as a Markdown file with YAML front matter.
/// </summary>
/// <remarks>
/// For each post this downloads the blogspot-hosted images it embeds, points image
/// and image-anchor URLs at the local copies, converts the HTML body to Markdown,
/// replaces embedded gist script tags with fenced code blocks and writes the result
/// to the relative folder <c>../../../_archive</c> (passed through MapAbsolutePath).
/// </remarks>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    var converter = new ReverseMarkdown.Converter();

    // Path.Combine instead of hard-coded backslashes so the tool also runs on non-Windows hosts.
    var archiveDir = Path.Combine("..", "..", "..", "_archive").MapAbsolutePath();
    Directory.CreateDirectory(archiveDir);

    foreach (var postItem in FetchAllPosts())
    {
        if (string.IsNullOrEmpty(postItem.Url))
        {
            // Both the image folder and the output file name are derived from the URL.
            continue;
        }

        // Last URL segment without ".html"; shared by the image folder and the .md file.
        var slug = postItem.Url.Split("/").Last().Replace(".html", "");
        var content = postItem.Content ?? "";

        content = LocalizeImages(content, slug);
        content = RewriteImageAnchors(content, slug);

        // HTML -> MarkDown
        content = converter.Convert(content);

        // Replace Gist embed scripts with code fence.
        content = InlineGists(content);
        postItem.Content = content;

        // Write modified MD
        var tags = postItem.Labels != null && postItem.Labels.Count > 0
            ? "tags:\n- " + postItem.Labels.Join("\n- ")
            : "";
        var mdOutput = mdTemplate.Fmt(
            QuoteYaml(postItem.Title),
            // Invariant culture: the current culture's calendar must not leak into the date.
            postItem.Published.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture),
            tags,
            postItem.Content);
        File.WriteAllText(Path.Combine(archiveDir, slug + ".md"), mdOutput);
    }
}

// Downloads every page of the blog's post listing and returns the posts de-duplicated by Id.
private static List<PostItem> FetchAllPosts()
{
    const string postsEndpoint = "https://www.googleapis.com/blogger/v3/blogs/<YourBlogId>/posts?key=<APIKEY>";

    var posts = new List<PostItem>();
    string pageToken = null;
    do
    {
        var pageUrl = pageToken == null
            ? postsEndpoint
            : postsEndpoint + "&pageToken=" + Uri.EscapeDataString(pageToken);
        var page = pageUrl.GetJsonFromUrl().FromJson<BloggerPostsResponse>();

        // A page can come back without any items (e.g. an empty blog).
        if (page != null && page.Items != null)
        {
            posts.AddRange(page.Items.Where(x => x != null));
        }

        pageToken = page == null ? null : page.NextPageToken;
    } while (!string.IsNullOrEmpty(pageToken));

    return posts.GroupBy(x => x.Id)
        .Select(x => x.First())
        .ToList();
}

// Saves each blogspot-hosted image under images/archive/<slug>/ and returns the content with those image URLs replaced by the local path.
private static string LocalizeImages(string content, string slug)
{
    var images = GetImageLinks(content)
        .Where(x => x.Contains("blogspot.com"))
        .Distinct()
        .ToList();
    var imgDirPath = Path.Combine("..", "..", "..", "images", "archive", slug).MapAbsolutePath();

    foreach (var img in images)
    {
        // Expected layout: scheme//host/a/b/c/d/<scale>/<file>, i.e. scale at index 7, file at index 8.
        var segments = img.Split("/");
        if (segments.Length < 9)
        {
            continue;
        }

        var scale = segments[7];
        var imgName = segments[8];

        // Swap the size segment for s10000 before downloading.
        var bytes = img.Replace($"/{scale}/", "/s10000/").GetBytesFromUrl();

        Directory.CreateDirectory(imgDirPath);
        File.WriteAllBytes((Path.Join(imgDirPath, imgName)).MapAbsolutePath(), bytes);
        content = content.Replace(img, $"/images/archive/{slug}/{imgName}");
    }

    return content;
}

// Points anchors that wrap blogspot-hosted images at the local image path; other blogspot links are left untouched.
private static string RewriteImageAnchors(string content, string slug)
{
    var anchorLinks = GetAnchorLinks(content)
        .Where(x => x.Contains("blogspot.com"))
        .Distinct()
        .ToList();

    foreach (var link in anchorLinks)
    {
        // Links to other posts have fewer segments than an image URL; indexing [8] on them used to throw.
        var segments = link.Split("/");
        if (segments.Length < 9)
        {
            continue;
        }

        content = content.Replace(link, $"/images/archive/{slug}/{segments[8]}");
    }

    return content;
}

// Replaces each gist embed script tag with one fenced code block per file of that gist.
private static string InlineGists(string markdown)
{
    var scripts = GetScriptUrls(markdown)
        .Where(x => x.Contains("gist.github.com"))
        .Distinct()
        .ToList();

    foreach (var script in scripts)
    {
        // Drop any query/fragment, then the ".js" suffix, to get at the gist id.
        // NOTE(review): a "?file=" selector is not honoured; every file of the gist is emitted.
        var url = script.Split('?', '#')[0];
        if (url.EndsWith(".js", StringComparison.OrdinalIgnoreCase))
        {
            url = url.Substring(0, url.Length - 3);
        }

        var gistId = url.TrimEnd('/').Split("/").Last();
        var gistJson = ("https://api.github.com/gists/" + gistId).GetJsonFromUrl(request =>
        {
            request.Headers.Add("User-Agent","Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:93.0) Gecko/20100101 Firefox/93.0");
        });

        var fences = new List<string>();
        using (var gistDoc = JsonDocument.Parse(gistJson))
        {
            JsonElement files;
            if (gistDoc.RootElement.TryGetProperty("files", out files)
                && files.ValueKind == JsonValueKind.Object)
            {
                foreach (var file in files.EnumerateObject())
                {
                    var gistFile = file.Value.ToString().FromJson<GistFile>();
                    var code = gistFile == null ? null : gistFile.content;
                    fences.Add("\n```\n" + code + "\n```\n");
                }
            }
        }

        if (fences.Count == 0)
        {
            // Nothing to inline; keep the script tag so the reference to the gist is not lost.
            continue;
        }

        markdown = markdown.Replace($"<script src=\"{script}\"></script>", string.Concat(fences));
    }

    return markdown;
}

// Wraps a value in a double-quoted YAML scalar so characters such as ':' or '#' cannot break the front matter.
private static string QuoteYaml(string value)
{
    return "\"" + (value ?? "").Replace("\\", "\\\\").Replace("\"", "\\\"") + "\"";
}
/// <summary>
/// Lazily yields the quoted <c>src</c> value of each <c>&lt;script&gt;</c> tag in the given markup.
/// </summary>
/// <param name="inputHTML">Text to scan.</param>
/// <returns>One string per matching tag, in the order the tags appear.</returns>
static IEnumerable<String> GetScriptUrls(String inputHTML)
{
    // Case-insensitive; the named group "L" captures the text between the quotes after src=.
    const string scriptSrcPattern = @"<script\b[^\<\>]+?\bsrc\s*=\s*[""'](?<L>.+?)[""'][^\<\>]*?\>";

    var current = Regex.Match(inputHTML, scriptSrcPattern, RegexOptions.IgnoreCase);
    while (current.Success)
    {
        yield return current.Groups["L"].Value;
        current = current.NextMatch();
    }
}
/// <summary>
/// Lazily yields the quoted <c>src</c> value of each <c>&lt;img&gt;</c> tag in the given markup.
/// </summary>
/// <param name="inputHTML">Text to scan.</param>
/// <returns>One string per matching tag, in the order the tags appear.</returns>
static IEnumerable<String> GetImageLinks(String inputHTML)
{
    // Case-insensitive; the named group "L" captures the text between the quotes after src=.
    const string imgSrcPattern = @"<img\b[^\<\>]+?\bsrc\s*=\s*[""'](?<L>.+?)[""'][^\<\>]*?\>";

    var current = Regex.Match(inputHTML, imgSrcPattern, RegexOptions.IgnoreCase);
    while (current.Success)
    {
        yield return current.Groups["L"].Value;
        current = current.NextMatch();
    }
}
/// <summary>
/// Lazily yields the quoted <c>href</c> value of each <c>&lt;a&gt;</c> tag in the given markup.
/// </summary>
/// <param name="inputHTML">Text to scan.</param>
/// <returns>One string per matching tag, in the order the tags appear.</returns>
static IEnumerable<String> GetAnchorLinks(String inputHTML)
{
    // Case-insensitive; the named group "L" captures the text between the quotes after href=.
    const string anchorHrefPattern = @"<a\b[^\<\>]+?\bhref\s*=\s*[""'](?<L>.+?)[""'][^\<\>]*?\>";

    var current = Regex.Match(inputHTML, anchorHrefPattern, RegexOptions.IgnoreCase);
    while (current.Success)
    {
        yield return current.Groups["L"].Value;
        current = current.NextMatch();
    }
}
} | |
/// <summary>
/// One page of the Blogger v3 post listing, as deserialized from the
/// <c>/blogs/{id}/posts</c> response.
/// </summary>
public class BloggerPostsResponse
{
    /// <summary>
    /// Token to pass as <c>pageToken</c> to request the following page;
    /// null when the response carries none, which is what ends the paging loop.
    /// </summary>
    public string NextPageToken { get; set; }

    /// <summary>
    /// Posts contained in this page. Starts out as an empty list so that a response
    /// without an <c>items</c> member leaves a usable collection instead of null.
    /// NOTE(review): an explicit JSON null would presumably still overwrite this default.
    /// </summary>
    public List<PostItem> Items { get; set; } = new List<PostItem>();
}
/// <summary>
/// A single blog post as returned by the Blogger post listing.
/// </summary>
public class PostItem
{
    /// <summary>Post identifier; used to drop duplicate posts.</summary>
    public string Id { get; set; }

    /// <summary>Publication timestamp; written to the front matter as <c>yyyy-MM-dd</c>.</summary>
    public DateTime Published { get; set; }

    /// <summary>Public URL of the post; its last path segment names the output file and image folder.</summary>
    public string Url { get; set; }

    /// <summary>Post title, written to the front matter.</summary>
    public string Title { get; set; }

    /// <summary>Post body. Arrives as HTML and is rewritten in place during migration.</summary>
    public string Content { get; set; }

    /// <summary>Labels attached to the post, emitted as front-matter tags; may be null.</summary>
    public List<string> Labels { get; set; }
}
/// <summary>
/// One entry of the <c>files</c> object in a GitHub gist API response.
/// Property names are lower-case, presumably to mirror the JSON field names — confirm before renaming.
/// </summary>
public class GistFile
{
    /// <summary>Name of the file inside the gist.</summary>
    public string filename { get; set; }

    /// <summary>Value of the <c>type</c> field of the entry.</summary>
    public string type { get; set; }

    /// <summary>File text; this is what gets placed inside the generated code fence.</summary>
    public string content { get; set; }

    /// <summary>Value of the <c>raw_url</c> field of the entry.</summary>
    public string raw_url { get; set; }
}
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment