Skip to content

Instantly share code, notes, and snippets.

@rosszurowski
Last active March 10, 2022 17:14
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save rosszurowski/967282fdc8a1e1d20a3919fdb1fce796 to your computer and use it in GitHub Desktop.
Save rosszurowski/967282fdc8a1e1d20a3919fdb1fce796 to your computer and use it in GitHub Desktop.
Download images from a Squarespace export file

Squarespace Image Exporter

This is a simple script to download all images from a Squarespace export file. Squarespace only supports exporting your content to a WordPress-compatible XML file, but doesn't give you any of the image content to accompany it. This script downloads the highest quality version of the images from the export to a local folder.

Usage

You'll need Go installed. The easiest way to get it is with:

brew install go

Once that's done, copy and paste the script into a folder and name it export.go.

Navigate to that folder from the command-line and run:

go run export.go --export-file=your-export.xml

You can optionally specify --output-dir and --concurrency (defaults to 5 requests at a time).

package main
import (
"context"
"encoding/xml"
"flag"
"fmt"
"io"
"log"
"net/http"
"os"
"path/filepath"
"strings"
"sync"
"sync/atomic"
)
// Command-line flags: the export XML to read, the folder to write
// downloaded attachments into, and how many downloads run in parallel.
var (
exportFile = flag.String("export-file", "", "squarespace export file")
outputDir = flag.String("output-dir", "./output/", "output directory")
concurrency = flag.Int("concurrency", 5, "number of concurrent downloads")
)
// main reads the Squarespace export file named by --export-file, extracts
// the attachment URLs from it, and downloads them all into --output-dir.
func main() {
	flag.Parse()
	if *exportFile == "" {
		log.Fatal("export file not specified, use --export-file")
	}

	data, err := os.ReadFile(*exportFile)
	if err != nil {
		log.Fatalf("failed to read file: %s", err)
	}

	urls, err := parseAttachmentURLs(data)
	if err != nil {
		log.Fatalf("failed to parse export file: %s", err)
	}

	dest := filepath.Clean(*outputDir)
	downloadAttachments(context.Background(), urls, dest, *concurrency)
}
// ExportData models the subset of a WordPress-format export file that this
// script needs: every <item> under <channel>, with the WordPress-namespaced
// (http://wordpress.org/export/1.2/) attachment URL, post type, and post name.
type ExportData struct {
Items []struct {
// AttachmentURL is the wp:attachment_url element — the asset's CDN URL.
AttachmentURL string `xml:"http://wordpress.org/export/1.2/ attachment_url"`
// PostType distinguishes "attachment" items from regular posts/pages.
PostType string `xml:"http://wordpress.org/export/1.2/ post_type"`
PostName string `xml:"http://wordpress.org/export/1.2/ post_name"`
Title string `xml:"title"`
} `xml:"channel>item"`
}
// parseAttachmentURLs parses the export XML and returns the list of
// downloadable attachment URLs, normalized to https:// and deduplicated
// (exports can list the same asset under both http and https).
func parseAttachmentURLs(b []byte) ([]string, error) {
	var export ExportData
	if err := xml.Unmarshal(b, &export); err != nil {
		return nil, err
	}

	var urls []string
	for _, item := range export.Items {
		if item.PostType != "attachment" {
			continue
		}
		if !isDownloadableAttachment(item.AttachmentURL) {
			// Fixed typo in the original message ("attachemnt").
			fmt.Println("skipping invalid attachment:", item.AttachmentURL)
			continue
		}
		urls = append(urls, item.AttachmentURL)
	}

	// Deduplicate URLs that are the same but only differ by protocol,
	// keeping a single https:// version of each.
	seen := make(map[string]bool, len(urls))
	dedupedUrls := make([]string, 0, len(urls))
	for _, url := range urls {
		urlWithoutProtocol := strings.TrimPrefix(strings.TrimPrefix(url, "http://"), "https://")
		if !seen[urlWithoutProtocol] {
			seen[urlWithoutProtocol] = true
			dedupedUrls = append(dedupedUrls, "https://"+urlWithoutProtocol)
		}
	}
	return dedupedUrls, nil
}
// isDownloadableAttachment reports whether url ends in a file extension this
// script knows how to download (images, audio, or PDF). The comparison is
// case-insensitive.
func isDownloadableAttachment(url string) bool {
	lower := strings.ToLower(url)
	exts := []string{".jpg", ".jpeg", ".png", ".gif", ".svg", ".webp", ".mp3", ".pdf"}
	for _, ext := range exts {
		if strings.HasSuffix(lower, ext) {
			return true
		}
	}
	return false
}
// downloadAttachments downloads every URL in urls into outputDir, running at
// most `concurrency` downloads at once. Individual failures are logged but do
// not abort the run; a summary count is logged when all downloads finish.
func downloadAttachments(ctx context.Context, urls []string, outputDir string, concurrency int) {
	if _, err := os.Stat(outputDir); os.IsNotExist(err) {
		if err := os.MkdirAll(outputDir, os.ModePerm); err != nil {
			log.Fatalf("failed to create output directory: %s", err)
		}
	} else if err != nil {
		log.Fatalf("failed to stat output dir: %s", err)
	}

	// Buffered channel used as a counting semaphore to bound concurrency.
	semaphore := make(chan struct{}, concurrency)
	var count int32 // successful downloads; updated atomically by workers
	var wg sync.WaitGroup
	wg.Add(len(urls))
	for _, url := range urls {
		go func(url string) {
			defer wg.Done()
			semaphore <- struct{}{}
			// Release via defer so the slot is returned even if
			// downloadAttachment panics.
			defer func() { <-semaphore }()
			log.Printf("downloading .../%s", trimURL(url))
			if err := downloadAttachment(ctx, url, outputDir); err != nil {
				log.Printf("failed to download attachment %q: %s", url, err)
			} else {
				atomic.AddInt32(&count, 1)
			}
		}(url)
	}
	wg.Wait()
	// Fixed grammar: this reports a count, so pluralize "attachments".
	log.Printf("downloaded %d/%d attachments", count, len(urls))
}
// downloadAttachment downloads an attachment from the given URL and saves it to the given output directory.
// downloadAttachment fetches attachmentURL and writes the response body into
// outputDir. Attachments can have duplicate basenames, so getFilename keeps
// the URL's UUID path segments in the name. It refuses to overwrite an
// existing file and returns an error describing the first step that failed.
func downloadAttachment(ctx context.Context, attachmentURL string, outputDir string) error {
	filename := getFilename(attachmentURL)
	file := filepath.Join(outputDir, filename)

	// Check for an existing file before fetching, so re-runs don't waste a
	// network request on attachments that are already downloaded. (The
	// original checked only after the fetch completed.)
	if _, err := os.Stat(file); err == nil {
		return fmt.Errorf("file already exists: %s", file)
	} else if !os.IsNotExist(err) {
		// Distinguish real stat failures from "file exists" — the original
		// reported both the same way.
		return fmt.Errorf("couldn't stat file: %s", err)
	}

	r, err := http.NewRequestWithContext(ctx, http.MethodGet, attachmentURL, nil)
	if err != nil {
		return fmt.Errorf("couldn't create request: %s", err)
	}
	resp, err := http.DefaultClient.Do(r)
	if err != nil {
		return fmt.Errorf("couldn't fetch: %s", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("bad response: %s", resp.Status)
	}

	// O_EXCL closes the stat/create race: creation fails if the file
	// appeared between the check above and this call.
	f, err := os.OpenFile(file, os.O_WRONLY|os.O_CREATE|os.O_EXCL, 0o644)
	if err != nil {
		return fmt.Errorf("couldn't create file: %s", err)
	}
	defer f.Close()
	if _, err := io.Copy(f, resp.Body); err != nil {
		return fmt.Errorf("couldn't write attachment: %s", err)
	}
	return nil
}
// slashToDash converts URL path separators into dashes so the remaining
// path can be used as a flat filename.
var slashToDash = strings.NewReplacer("/", "-")

// getFilename derives a local filename from an attachment URL: the known
// CDN prefixes are stripped and the remaining slashes become dashes, so the
// URL's UUID segments stay in the name and prevent basename collisions.
func getFilename(url string) string {
	trimmed := trimURL(url)
	return slashToDash.Replace(trimmed)
}

// trimURL removes the known Squarespace CDN host prefixes from url, leaving
// the path portion that uniquely identifies the asset.
func trimURL(url string) string {
	const (
		cdnPrefix    = "https://images.squarespace-cdn.com/content/v1/"
		staticPrefix = "https://static1.squarespace.com/static/"
	)
	return strings.TrimPrefix(strings.TrimPrefix(url, cdnPrefix), staticPrefix)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment