|
package main |
|
|
|
import ( |
|
"context" |
|
"encoding/xml" |
|
"flag" |
|
"fmt" |
|
"io" |
|
"log" |
|
"net/http" |
|
"os" |
|
"path/filepath" |
|
"strings" |
|
"sync" |
|
"sync/atomic" |
|
) |
|
|
|
// Command-line flags controlling input, output location, and parallelism.
var (
	// exportFile is the path to the Squarespace (WordPress-format) XML export.
	exportFile = flag.String("export-file", "", "squarespace export file")

	// outputDir is where downloaded attachments are written.
	outputDir = flag.String("output-dir", "./output/", "output directory")

	// concurrency caps the number of simultaneous HTTP downloads.
	concurrency = flag.Int("concurrency", 5, "number of concurrent downloads")
)
|
|
|
func main() { |
|
flag.Parse() |
|
|
|
if *exportFile == "" { |
|
log.Fatal("export file not specified, use --export-file") |
|
} |
|
|
|
b, err := os.ReadFile(*exportFile) |
|
if err != nil { |
|
log.Fatalf("failed to read file: %s", err) |
|
} |
|
|
|
urls, err := parseAttachmentURLs(b) |
|
if err != nil { |
|
log.Fatalf("failed to parse export file: %s", err) |
|
} |
|
|
|
out := filepath.Clean(*outputDir) |
|
downloadAttachments(context.Background(), urls, out, *concurrency) |
|
} |
|
|
|
// ExportData models the subset of a WordPress-format (WXR 1.2) export file
// that this tool needs: the <item> elements under <channel>, with the
// namespaced wp:* fields spelled out as "namespace-URL local-name" in the
// xml tags.
type ExportData struct {
	// Items holds one entry per <channel><item> element.
	Items []struct {
		// AttachmentURL is the wp:attachment_url element — the file to download.
		AttachmentURL string `xml:"http://wordpress.org/export/1.2/ attachment_url"`
		// PostType is the wp:post_type element; only "attachment" items carry files.
		PostType string `xml:"http://wordpress.org/export/1.2/ post_type"`
		// PostName is the wp:post_name slug (not referenced by the visible download logic).
		PostName string `xml:"http://wordpress.org/export/1.2/ post_name"`
		// Title is the item's plain <title> (not referenced by the visible download logic).
		Title string `xml:"title"`
	} `xml:"channel>item"`
}
|
|
|
// parseAttachmentURLs parses the export file and returns a list of attachment URLs. |
|
func parseAttachmentURLs(b []byte) ([]string, error) { |
|
var export ExportData |
|
err := xml.Unmarshal(b, &export) |
|
if err != nil { |
|
return nil, err |
|
} |
|
var urls []string |
|
for _, item := range export.Items { |
|
if item.PostType != "attachment" { |
|
continue |
|
} |
|
if !isDownloadableAttachment(item.AttachmentURL) { |
|
fmt.Println("skipping invalid attachemnt:", item.AttachmentURL) |
|
continue |
|
} |
|
urls = append(urls, item.AttachmentURL) |
|
} |
|
// deduplicate urls that are the same, but only have different protocols |
|
seen := make(map[string]bool) |
|
var dedupedUrls []string |
|
for _, url := range urls { |
|
urlWithoutProtocol := strings.TrimPrefix(strings.TrimPrefix(url, "http://"), "https://") |
|
if found := seen[urlWithoutProtocol]; !found { |
|
seen[urlWithoutProtocol] = true |
|
dedupedUrls = append(dedupedUrls, "https://"+urlWithoutProtocol) |
|
} |
|
} |
|
|
|
return dedupedUrls, nil |
|
} |
|
|
|
// isDownloadableAttachment reports whether url ends in one of the media file
// extensions this tool knows how to download. The comparison is
// case-insensitive.
func isDownloadableAttachment(url string) bool {
	lower := strings.ToLower(url)
	for _, ext := range []string{".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".mp3", ".pdf"} {
		if strings.HasSuffix(lower, ext) {
			return true
		}
	}
	return false
}
|
|
|
// downloadAttachments downloads every URL in urls into outputDir, running at
// most `concurrency` downloads at once. Individual failures are logged and
// skipped; the function returns only after all goroutines have finished.
// It exits the process (log.Fatalf) if the output directory cannot be
// created or statted.
func downloadAttachments(ctx context.Context, urls []string, outputDir string, concurrency int) {
	// Ensure the output directory exists before any goroutine writes into it.
	_, err := os.Stat(outputDir)
	if os.IsNotExist(err) {
		err = os.MkdirAll(outputDir, os.ModePerm)
		if err != nil {
			log.Fatalf("failed to create output directory: %s", err)
		}
	} else if err != nil {
		log.Fatalf("failed to stat output dir: %s", err)
	}

	// Buffered channel used as a counting semaphore: a send acquires a slot,
	// a receive releases it, capping concurrent downloads at `concurrency`.
	semaphore := make(chan struct{}, concurrency)

	var count int32 // successful downloads; updated atomically by the workers
	var wg sync.WaitGroup
	wg.Add(len(urls))
	for _, url := range urls {
		// url is passed as an argument so each goroutine gets its own copy
		// (required under pre-1.22 loop-variable semantics).
		go func(url string) {
			semaphore <- struct{}{} // acquire a slot; blocks while `concurrency` downloads are active
			defer wg.Done()
			log.Printf("downloading .../%s", trimURL(url))
			err := downloadAttachment(ctx, url, outputDir)
			if err != nil {
				// Best-effort: log the failure and carry on with the rest.
				log.Printf("failed to download attachment %q: %s", url, err)
			} else {
				atomic.AddInt32(&count, 1)
			}
			<-semaphore // release the slot
		}(url)
	}
	wg.Wait()
	// wg.Wait() establishes happens-before with all workers, so the plain
	// read of count here is safe.
	log.Printf("downloaded %d/%d attachment", count, len(urls))
}
|
|
|
// downloadAttachment downloads an attachment from the given URL and saves it to the given output directory. |
|
func downloadAttachment(ctx context.Context, attachmentURL string, outputDir string) error { |
|
r, err := http.NewRequestWithContext(ctx, "GET", attachmentURL, nil) |
|
if err != nil { |
|
return fmt.Errorf("couldn't create request: %s", err) |
|
} |
|
resp, err := http.DefaultClient.Do(r) |
|
if err != nil { |
|
return fmt.Errorf("couldn't fetch: %s", err) |
|
} |
|
defer resp.Body.Close() |
|
if resp.StatusCode != http.StatusOK { |
|
return fmt.Errorf("bad response: %s", resp.Status) |
|
} |
|
// Attachments can have duplicate filenames, so we include the UUIDs in the URL |
|
// as part of the filename too. |
|
filename := getFilename(attachmentURL) |
|
file := filepath.Join(outputDir, filename) |
|
|
|
_, err = os.Stat(file) |
|
if !os.IsNotExist(err) { |
|
return fmt.Errorf("file already exists: %s", file) |
|
} |
|
|
|
f, err := os.Create(file) |
|
if err != nil { |
|
return fmt.Errorf("couldn't create file: %s", err) |
|
} |
|
defer f.Close() |
|
_, err = io.Copy(f, resp.Body) |
|
if err != nil { |
|
return fmt.Errorf("couldn't write attachment: %s", err) |
|
} |
|
return nil |
|
} |
|
|
|
// slashToDash converts path separators to dashes so a URL path can serve as
// a flat filename.
var slashToDash = strings.NewReplacer("/", "-")

// getFilename derives a filesystem-safe filename from an attachment URL: the
// known CDN prefixes are stripped and the remaining slashes become dashes,
// keeping the URL's UUID path segments in the name to avoid collisions.
func getFilename(url string) string {
	trimmed := trimURL(url)
	return slashToDash.Replace(trimmed)
}

// trimURL strips the known Squarespace CDN prefixes from url, leaving the
// unique remainder of the path. URLs with neither prefix pass through
// unchanged.
func trimURL(url string) string {
	const (
		cdnPrefix    = "https://images.squarespace-cdn.com/content/v1/"
		staticPrefix = "https://static1.squarespace.com/static/"
	)
	return strings.TrimPrefix(strings.TrimPrefix(url, cdnPrefix), staticPrefix)
}