Skip to content

Instantly share code, notes, and snippets.

@mcmonkey4eva
Last active January 27, 2023 10:27
Show Gist options
  • Save mcmonkey4eva/176fa6fc9a121cd19117674dcd26b26e to your computer and use it in GitHub Desktop.
Save mcmonkey4eva/176fa6fc9a121cd19117674dcd26b26e to your computer and use it in GitHub Desktop.
Lexica scraper

lexica.art scraper

  • 1: open https://lexica.art/
  • 2: open browser console
  • 3: paste in the JavaScript, hit enter
  • 4: expand the output and copy it to a text file
  • 5: optionally, repeat with multiple sets of pages
  • 6: Save the .cs and .csproj files together in a folder, open the .csproj in Visual Studio
  • 7: Edit the two arguments of the Run(...) call at the top of the .cs file: the path to your saved text file and the path to the output folder
  • 8: Run it
  • 9: enjoy! Train a model off the content or something
// JavaScript for your browser console
// Builds one text line per gallery cell in the form
//   "<prompt page URL> <image URL> <caption text>"
// and prints the combined result to the console so it can be copied to a file.
var out = "";
for (var elem of document.getElementsByTagName('div')) {
    // A gallery cell is recognised by its first attribute having the value 'gridcell'.
    // Divs with no attributes at all are skipped too (the original check let them through).
    var firstAttr = elem.attributes[0];
    if (firstAttr === undefined || firstAttr.nodeValue != 'gridcell') {
        continue;
    }
    // Skip cells that lack a prompt link or an image instead of throwing a TypeError.
    var aElem = elem.getElementsByTagName('a')[0];
    if (aElem === undefined || !aElem.href.startsWith('https://lexica.art/prompt/')) {
        continue;
    }
    var img = elem.getElementsByTagName('img')[0];
    if (img === undefined) {
        continue;
    }
    out += aElem.href;
    out += " " + img.src;
    for (var pElem of elem.getElementsByTagName('p')) {
        // Each record must stay on a single line, so line breaks inside the caption become spaces.
        out += " " + pElem.innerText.replace(/[\r\n]+/g, ' ');
    }
    out += "\n";
}
console.log(out);
using System;
using System.Collections.Generic;
using System.Linq;
using System.Net.Http;
using System.IO;
using System.Drawing;
using System.Drawing.Imaging;
using System.Threading.Tasks;
using FreneticUtilities.FreneticExtensions;
using FreneticUtilities.FreneticToolkit;
// Program entry: edit both arguments below before running.
Run(
    // PATH TO TEXT FILE HERE:
    fileName: "./my-file.txt",
    // PATH TO OUT DIR HERE:
    outFolder: "./out/"
);
///////////
// Downloads every image listed in the scraped text file and stores it next to its prompt.
//   fileName:  text file with one entry per line, formatted "<page url> <image url> <prompt>".
//   outFolder: directory that receives "<id>.png" and "<id>.txt" per entry; created when missing.
// Duplicate lines are processed once. A failure on one entry is logged and skipped after a
// one-second pause; it never aborts the whole run.
static void Run(string fileName, string outFolder)
{
    AsciiMatcher hex = new(AsciiMatcher.Digits + "abcdefABCDEF");
    using HttpClient client = new();
    client.DefaultRequestHeaders.UserAgent.ParseAdd("Scrapey/1.0");
    // Make sure the destination exists up front; otherwise every single save would fail.
    // An empty folder string means "current directory", which needs no creation.
    if (!string.IsNullOrEmpty(outFolder))
    {
        Directory.CreateDirectory(outFolder);
    }
    string[] set = File.ReadAllText(fileName).Replace('\r', '\n').Split('\n', StringSplitOptions.RemoveEmptyEntries).Distinct().ToArray();
    int count = 0;
    foreach (string url in set)
    {
        try
        {
            // Pre-increment so progress reads "1 / N" through "N / N".
            Console.WriteLine($"Load {++count} / {set.Length}...");
            // Only the first two spaces separate fields; the prompt itself may contain spaces.
            string[] opts = url.Split(' ', 3);
            if (opts.Length != 3)
            {
                Console.WriteLine($"Ignore {url} as invalid");
                continue;
            }
            string prompt = opts[2];
            if (prompt.EndsWith("..."))
            {
                // The text was cut off: drop the trailing fragment together with the ellipsis.
                prompt = prompt.BeforeLast(' ');
            }
            // GetAwaiter().GetResult() surfaces the real exception instead of an AggregateException wrapper.
            byte[] imageData = client.GetByteArrayAsync(opts[1]).GetAwaiter().GetResult();
            if (imageData is null || imageData.Length == 0)
            {
                Console.WriteLine($"Couldn't load image {url}");
                Task.Delay(1000).Wait();
                continue;
            }
            // File name: the hex characters of the last path segment of the image URL.
            string uuid = hex.TrimToMatches(opts[1].AfterLast('/'));
            if (string.IsNullOrEmpty(uuid))
            {
                // Without an id every such entry would overwrite the same ".png" / ".txt" pair.
                Console.WriteLine($"Ignore {url} as invalid");
                continue;
            }
            // The stream has to stay open for as long as the Image is in use; both are
            // released (image first) when this try block ends.
            using MemoryStream imageStream = new(imageData);
            using Image image = Image.FromStream(imageStream);
            // Shrink so the longer side is at most 768 pixels; smaller images keep their size.
            int wider = Math.Max(image.Width, image.Height);
            float scale = wider > 768 ? 768f / wider : 1;
            // Clamp to 1 so an extreme aspect ratio cannot truncate a side to zero pixels.
            int targetWidth = Math.Max(1, (int)(image.Width * scale));
            int targetHeight = Math.Max(1, (int)(image.Height * scale));
            // Disposed per iteration; an undisposed Bitmap leaks GDI resources on long runs.
            using Bitmap clone = new(targetWidth, targetHeight, PixelFormat.Format24bppRgb);
            using (Graphics gr = Graphics.FromImage(clone))
            {
                gr.DrawImage(image, new Rectangle(0, 0, clone.Width, clone.Height));
            }
            // Path.Combine works whether or not outFolder ends with a separator.
            string basePath = Path.Combine(outFolder ?? "", uuid);
            clone.Save(basePath + ".png", ImageFormat.Png);
            File.WriteAllText(basePath + ".txt", prompt);
        }
        catch (Exception ex)
        {
            // Best effort: report, pause briefly, and move on to the next entry.
            Console.WriteLine($"Couldn't load page {url} because error {ex}");
            Task.Delay(1000).Wait();
        }
    }
    Console.WriteLine("Done!");
}
<Project Sdk="Microsoft.NET.Sdk">
<!-- Build settings for the image downloader program that accompanies this project file. -->
<PropertyGroup>
<!-- Build a runnable executable rather than a library. -->
<OutputType>Exe</OutputType>
<!-- Windows-specific .NET 6 target. NOTE(review): presumably chosen because the program draws with System.Drawing; confirm before retargeting. -->
<TargetFramework>net6.0-windows</TargetFramework>
</PropertyGroup>
<ItemGroup>
<!-- NuGet package expected to supply the FreneticUtilities namespaces imported by the program. -->
<PackageReference Include="FreneticLLC.FreneticUtilities" Version="1.0.1" />
<!-- NuGet package expected to supply the System.Drawing and System.Drawing.Imaging namespaces imported by the program. -->
<PackageReference Include="System.Drawing.Common" Version="7.0.0" />
</ItemGroup>
</Project>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment