Skip to content

Instantly share code, notes, and snippets.

@n0099
Last active July 31, 2022 22:13
Show Gist options
  • Save n0099/52fb3ba66f30e63b7afb8bd97fc9553f to your computer and use it in GitHub Desktop.
Save n0099/52fb3ba66f30e63b7afb8bd97fc9553f to your computer and use it in GitHub Desktop.
using System.Text.Encodings.Web;
using System.Text.RegularExpressions;
namespace tbm.Crawler
{
public class MigrateWorker : BackgroundService
{
/// <summary>Logger used for the progress and diagnostic output of the migration steps.</summary>
private readonly ILogger<MigrateWorker> _logger;

/// <summary>Lifetime scope from which every migration step begins its own child scope.</summary>
private readonly ILifetimeScope _scope;

/// <summary>Creates the worker from its injected logger and lifetime scope.</summary>
/// <param name="logger">Logger for this worker.</param>
/// <param name="scope">Scope used to resolve the database context factory.</param>
public MigrateWorker(ILogger<MigrateWorker> logger, ILifetimeScope scope) =>
    (_logger, _scope) = (logger, scope);
/// <summary>
/// Runs the migration once: converts every forum listed in <c>ForumsInfo</c> with <see cref="Convert4"/>,
/// then runs <see cref="Convert5"/> for fid 0, and finally terminates the process with exit code 0.
/// </summary>
/// <param name="stoppingToken">Checked before each forum and before the final step so a host shutdown stops the migration.</param>
/// <exception cref="OperationCanceledException">
/// Thrown when <paramref name="stoppingToken"/> is cancelled before the migration has finished.
/// </exception>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    await using var scope = _scope.BeginLifetimeScope();
    var db = scope.Resolve<TbmDbContext.New>()(0);

    // Materialize the ids up front so the query's result set is not kept open
    // on this context for the whole (potentially very long) conversion.
    var fids = (from f in db.ForumsInfo select f.Fid).ToList();
    foreach (var fid in fids)
    {
        stoppingToken.ThrowIfCancellationRequested();
        _logger.LogInformation("converting for fid:{} started", fid);
        await Convert4(fid);
        _logger.LogInformation("converting for fid:{} finished", fid);
    }

    stoppingToken.ThrowIfCancellationRequested();
    await Convert5(0);

    // The worker is a one-shot job: end the whole process once everything is converted.
    Environment.Exit(0);
}
/// <summary>
/// Options used when serializing the exception statistics for log output.
/// Fields are included because the statistics are value tuples, whose items are fields rather than properties.
/// </summary>
private static readonly JsonSerializerOptions JsonSerializerOptions = new()
{
    IncludeFields = true,
    Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
};

/// <summary>
/// Matches an http(s) picture URL on <c>tiebapic.baidu.com</c> or <c>imgsrc.baidu.com</c> under
/// <c>/forum/pic/item/</c> ending in <c>.jpg</c> with an optional query string, and captures the text
/// between <c>/forum/pic/item/</c> and <c>.jpg</c> as the group <c>hash</c>.
/// </summary>
/// <remarks>
/// The query-string part is a single optional group. A repeated group here (<c>(\?.*)*</c>) accepts the
/// same strings but can backtrack exponentially on a non-matching input containing many <c>?</c> characters.
/// </remarks>
private static readonly Regex ImgUrlExtractingRegex = new(
    @"^https?://(tiebapic|imgsrc)\.baidu\.com/forum/pic/item/(?<hash>.*?)\.jpg(\?.*)?$",
    RegexOptions.Compiled,
    TimeSpan.FromSeconds(1));
/// <summary>
/// One image reference found in a reply. <c>Convert5</c> builds these and de-duplicates them
/// by the pair (<see cref="Pid"/>, <see cref="HashOrUrl"/>) before adding them to the database context.
/// </summary>
public class ReplyImages
{
    /// <summary>Pid of the reply the image was found in.</summary>
    public ulong Pid { get; set; }

    /// <summary>The content item's <c>OriginSrc</c> value; defaults to the empty string.</summary>
    public string HashOrUrl { get; set; } = string.Empty;

    /// <summary>First comma-separated component of the content item's <c>Bsize</c>.</summary>
    public uint BsizeWidth { get; set; }

    /// <summary>Second comma-separated component of the content item's <c>Bsize</c>.</summary>
    public uint BsizeHeight { get; set; }

    /// <summary>The content item's <c>OriginSize</c> value.</summary>
    public uint ByteSize { get; set; }
}
/// <summary>
/// Reads every reply whose content is not null, decodes it with <c>PostContentWrapper.Parser</c> and
/// collects one <see cref="ReplyImages"/> per distinct (pid, origin source) pair for each content item
/// of type 3. Collected rows are saved every 50000 replies and once more after the last reply.
/// </summary>
/// <remarks>
/// A reply that fails to decode or parse is skipped; the failure is counted per
/// "exception type: message" together with the most recent failing pid and reported in the trace log.
/// Items of that reply collected before the failure are kept.
/// </remarks>
/// <param name="fid">Forum id passed to the <c>TbmDbContext.New</c> factory for both contexts.</param>
private async Task Convert5(Fid fid)
{
    await using var scope = _scope.BeginLifetimeScope();
    var db = scope.Resolve<TbmDbContext.New>()(fid);  // read side: enumerates the source replies
    var db2 = scope.Resolve<TbmDbContext.New>()(fid); // write side: saved and cleared per batch
    var replies = from r in db.ReplyContents where r.Content != null select r;

    // 64-bit counter so the batch check and the logged progress cannot wrap on very large tables.
    long i = 0;
    using var process = Process.GetCurrentProcess();
    var stopwatch = Stopwatch.StartNew();
    var exceptions = new Dictionary<string, (uint times, ulong pid)>();
    var newReplies = new Dictionary<(ulong Pid, string HashOrUrl), ReplyImages>();

    // Persists the pending rows, resets the batch state and logs progress for the batch.
    void SaveAndLog()
    {
        db2.Set<ReplyImages>().AddRange(newReplies.Values);
        _ = db2.SaveChanges();
        newReplies.Clear();
        db2.ChangeTracker.Clear();

        // Process caches its counters on first access; refresh so each log line reports current memory.
        process.Refresh();
        _logger.LogTrace("i:{} elapsed:{}ms mem:{}kb exceptions:{}", i,
            stopwatch.ElapsedMilliseconds,
            process.PrivateMemorySize64 / 1024,
            JsonSerializer.Serialize(exceptions, JsonSerializerOptions));
        stopwatch.Restart();
    }

    foreach (var reply in replies.AsNoTracking())
    {
        i++;
        if (i % 50000 == 0)
        {
            SaveAndLog();
        }

        try
        {
            var contents = PostContentWrapper.Parser.ParseFrom(reply.Content);
            foreach (var content in contents.Value.Where(c => c.Type == 3))
            {
                // Bsize is read as "<width>,<height>"; a malformed value throws and is tallied below.
                var bsizes = content.Bsize.Split(',');
                _ = newReplies.TryAdd((reply.Pid, content.OriginSrc), new()
                {
                    Pid = reply.Pid,
                    HashOrUrl = content.OriginSrc,
                    // Machine-generated numbers: parse independently of the current culture.
                    BsizeWidth = uint.Parse(bsizes[0], System.Globalization.CultureInfo.InvariantCulture),
                    BsizeHeight = uint.Parse(bsizes[1], System.Globalization.CultureInfo.InvariantCulture),
                    ByteSize = content.OriginSize
                });
            }
        }
        catch (Exception e)
        {
            // Best effort: tally the failure by "type: message" and remember the latest failing pid.
            var em = e.GetType().FullName + ": " + e.Message;
            exceptions[em] = exceptions.TryGetValue(em, out var seen)
                ? (seen.times + 1u, reply.Pid)
                : (1u, reply.Pid);
        }
    }

    // Flush whatever is left after the last full batch.
    SaveAndLog();
}
/// <summary>
/// Reads every reply whose content is not null, decodes it with <c>PostContentWrapper.Parser</c> and,
/// for each content item of type 3 whose <c>OriginSrc</c> matches <c>ImgUrlExtractingRegex</c>,
/// replaces <c>OriginSrc</c> with the captured <c>hash</c> group. Replies with at least one replaced
/// item get their re-serialized content written back; updates are saved every 50000 replies and once
/// more after the last reply.
/// </summary>
/// <remarks>
/// A reply that fails to decode or match (including a regex timeout) is skipped; the failure is counted
/// per "exception type: message" together with the most recent failing pid and reported in the trace log.
/// </remarks>
/// <param name="fid">Forum id passed to the <c>TbmDbContext.New</c> factory for both contexts.</param>
private async Task Convert4(Fid fid)
{
    await using var scope = _scope.BeginLifetimeScope();
    var db = scope.Resolve<TbmDbContext.New>()(fid);  // read side: enumerates the source replies
    var db2 = scope.Resolve<TbmDbContext.New>()(fid); // write side: saved and cleared per batch
    var replies = from r in db.ReplyContents where r.Content != null select r;

    // 64-bit counter so the batch check and the logged progress cannot wrap on very large tables.
    long i = 0;
    using var process = Process.GetCurrentProcess();
    var stopwatch = Stopwatch.StartNew();
    var exceptions = new Dictionary<string, (uint times, ulong pid)>();
    var newReplies = new List<ReplyContent>();

    // Persists the pending updates, resets the batch state and logs progress for the batch.
    void SaveAndLog()
    {
        // The pending entities carry only Pid and Content: attach them and flag Content
        // as modified so that saving writes that property back.
        db2.Set<ReplyContent>().AttachRange(newReplies);
        foreach (var e in db2.ChangeTracker.Entries<ReplyContent>())
        {
            e.Property(r => r.Content).IsModified = true;
        }
        _ = db2.SaveChanges();
        newReplies.Clear();
        db2.ChangeTracker.Clear();

        // Process caches its counters on first access; refresh so each log line reports current memory.
        process.Refresh();
        _logger.LogTrace("i:{} elapsed:{}ms mem:{}kb exceptions:{}", i,
            stopwatch.ElapsedMilliseconds,
            process.PrivateMemorySize64 / 1024,
            JsonSerializer.Serialize(exceptions, JsonSerializerOptions));
        stopwatch.Restart();
    }

    foreach (var reply in replies.AsNoTracking())
    {
        i++;
        if (i % 50000 == 0)
        {
            SaveAndLog();
        }

        try
        {
            var contents = PostContentWrapper.Parser.ParseFrom(reply.Content);
            var isModified = false;
            foreach (var content in contents.Value.Where(c => c.Type == 3))
            {
                // Values that do not match the picture-URL pattern are left untouched.
                if (ImgUrlExtractingRegex.Match(content.OriginSrc) is not {Success: true} matches)
                {
                    continue;
                }

                content.OriginSrc = matches.Groups["hash"].Value;
                isModified = true;
            }

            if (isModified)
            {
                newReplies.Add(new()
                {
                    Pid = reply.Pid,
                    Content = contents.ToByteArray()
                });
            }
        }
        catch (Exception e)
        {
            // Best effort: tally the failure by "type: message" and remember the latest failing pid.
            var em = e.GetType().FullName + ": " + e.Message;
            exceptions[em] = exceptions.TryGetValue(em, out var seen)
                ? (seen.times + 1u, reply.Pid)
                : (1u, reply.Pid);
        }
    }

    // Flush whatever is left after the last full batch.
    SaveAndLog();
}
}
}
-- Image references per reply: at most one row for each (pid, hashOrUrl) pair.
-- NOTE(review): presumably the table behind the ReplyImages entity saved by Convert5;
-- the entity-to-table mapping is not shown here, confirm before relying on it.
CREATE TABLE `tbm_f0_replies_img` (
    `id`          INT UNSIGNED    NOT NULL AUTO_INCREMENT,
    `pid`         BIGINT UNSIGNED NOT NULL,
    `hashOrUrl`   TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
    `bsizeWidth`  INT UNSIGNED    DEFAULT NULL,
    `bsizeHeight` INT UNSIGNED    DEFAULT NULL,
    `byteSize`    INT UNSIGNED    DEFAULT NULL,
    PRIMARY KEY (`id`),
    -- A TEXT column can only be indexed by prefix, hence the explicit 766-character length.
    UNIQUE KEY `pid` (`pid`, `hashOrUrl`(766)) USING BTREE
);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment