Last active
July 31, 2022 22:13
-
-
Save n0099/52fb3ba66f30e63b7afb8bd97fc9553f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Text.Encodings.Web; | |
using System.Text.RegularExpressions; | |
namespace tbm.Crawler | |
{ | |
public class MigrateWorker : BackgroundService | |
{ | |
/// <summary>Logger that receives the progress and statistics messages of the migration.</summary>
private readonly ILogger<MigrateWorker> _logger;

/// <summary>Parent lifetime scope; each migration step begins its own child scope from it.</summary>
private readonly ILifetimeScope _scope;

/// <summary>Creates the worker and stores its injected dependencies.</summary>
/// <param name="logger">Logger used for progress and statistics output.</param>
/// <param name="scope">Lifetime scope from which child scopes are begun.</param>
public MigrateWorker(ILogger<MigrateWorker> logger, ILifetimeScope scope) =>
    (_logger, _scope) = (logger, scope);
/// <summary>
/// Runs the migration: <see cref="Convert4"/> once for every forum id read from
/// <c>ForumsInfo</c>, then <see cref="Convert5"/> with fid 0, and finally exits the process
/// with exit code 0.
/// </summary>
/// <param name="stoppingToken">
/// Signalled when the host is stopping. It is observed while the forum ids are loaded and
/// before each conversion step starts; a step that is already running is not interrupted.
/// </param>
/// <exception cref="OperationCanceledException">
/// Thrown when <paramref name="stoppingToken"/> is signalled between two steps.
/// </exception>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    await using var scope = _scope.BeginLifetimeScope();
    var db = scope.Resolve<TbmDbContext.New>()(0);

    // Buffer the forum ids up front so that no query result stays open on this context
    // for the whole duration of the conversions below.
    var fids = await (from f in db.ForumsInfo select f.Fid).ToListAsync(stoppingToken);

    foreach (var fid in fids)
    {
        // Allow a host shutdown to end the migration between two forums.
        stoppingToken.ThrowIfCancellationRequested();

        _logger.LogInformation("converting for fid:{} started", fid);
        await Convert4(fid);
        _logger.LogInformation("converting for fid:{} finished", fid);
    }

    stoppingToken.ThrowIfCancellationRequested();
    await Convert5(0);

    // The worker is a one-shot job: terminate the whole process once everything is converted.
    Environment.Exit(0);
}
/// <summary>
/// Options used when the exception tallies are serialised into log messages.
/// <c>IncludeFields</c> is required because the tallies are value tuples, whose elements are
/// fields rather than properties; the relaxed encoder escapes fewer characters
/// (presumably to keep non-ASCII exception messages readable in the log).
/// </summary>
private static readonly JsonSerializerOptions JsonSerializerOptions = new()
{
    IncludeFields = true,
    Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping
};

/// <summary>
/// Matches http(s) image URLs under <c>/forum/pic/item/</c> on <c>tiebapic.baidu.com</c> or
/// <c>imgsrc.baidu.com</c> that end in <c>.jpg</c>, optionally followed by a query string,
/// and captures the file name without extension in the <c>hash</c> group.
/// </summary>
/// <remarks>
/// The optional query string is written as <c>(\?.*)?</c> rather than <c>(\?.*)*</c>: both
/// accept exactly the same inputs, but the nested quantifier of the latter can backtrack
/// excessively on inputs that do not match. The one second match timeout stays as a safety net.
/// </remarks>
private static readonly Regex ImgUrlExtractingRegex = new(
    @"^https?://(tiebapic|imgsrc)\.baidu\.com/forum/pic/item/(?<hash>.*?)\.jpg(\?.*)?$",
    RegexOptions.Compiled,
    TimeSpan.FromSeconds(1));
/// <summary>
/// Row describing one image referenced by a reply, as produced by <c>Convert5</c>.
/// </summary>
public class ReplyImages
{
    /// <summary>Id of the reply that contains the image.</summary>
    public ulong Pid { get; set; }

    /// <summary>The <c>OriginSrc</c> value of the image item; empty by default.</summary>
    public string HashOrUrl { get; set; } = string.Empty;

    /// <summary>First component of the image item's comma-separated <c>Bsize</c> value.</summary>
    public uint BsizeWidth { get; set; }

    /// <summary>Second component of the image item's comma-separated <c>Bsize</c> value.</summary>
    public uint BsizeHeight { get; set; }

    /// <summary>The <c>OriginSize</c> value of the image item.</summary>
    public uint ByteSize { get; set; }
}
/// <summary>
/// Reads every reply whose content is not null, decodes the content with
/// <c>PostContentWrapper.Parser</c> and inserts one <see cref="ReplyImages"/> row for each
/// distinct (pid, <c>OriginSrc</c>) pair found among the content items of type 3.
/// </summary>
/// <remarks>
/// Pending rows are written every 50000 replies and once more after the last reply. An
/// exception raised while handling a reply skips the remainder of that reply; it is not
/// rethrown, only counted per exception type and message, and the counters are written to
/// the trace log on every flush.
/// </remarks>
/// <param name="fid">Forum id passed to the <c>TbmDbContext</c> factory for both contexts.</param>
private async Task Convert5(Fid fid)
{
    await using var scope = _scope.BeginLifetimeScope();
    var db = scope.Resolve<TbmDbContext.New>()(fid);   // context the replies are read from
    var db2 = scope.Resolve<TbmDbContext.New>()(fid);  // context the new rows are written with
    var replies = from r in db.ReplyContents where r.Content != null select r;
    var i = 0;
    using var process = Process.GetCurrentProcess();
    var stopwatch = Stopwatch.StartNew();

    // Exception "type: message" -> (how often it occurred, pid of the latest reply that raised it).
    var exceptions = new Dictionary<string, (uint times, ulong pid)>();

    // Rows waiting to be inserted; the key drops repeated images within the pending batch.
    var newReplies = new Dictionary<(ulong Pid, string HashOrUrl), ReplyImages>();

    // Inserts the pending rows, resets the batch and logs progress figures for it.
    async Task SaveAndLog()
    {
        db2.Set<ReplyImages>().AddRange(newReplies.Values);
        _ = await db2.SaveChangesAsync();
        newReplies.Clear();
        db2.ChangeTracker.Clear();

        // Process caches its counters; without Refresh() every log line would repeat
        // the memory figure captured by the first read.
        process.Refresh();
        _logger.LogTrace("i:{} elapsed:{}ms mem:{}kb exceptions:{}", i,
            stopwatch.ElapsedMilliseconds,
            process.PrivateMemorySize64 / 1024,
            JsonSerializer.Serialize(exceptions, JsonSerializerOptions));
        stopwatch.Restart();
    }

    foreach (var reply in replies.AsNoTracking())
    {
        i++;
        if (i % 50000 == 0)
        {
            await SaveAndLog();
        }

        try
        {
            var contents = PostContentWrapper.Parser.ParseFrom(reply.Content);
            foreach (var content in contents.Value.Where(content => content.Type == 3))
            {
                // Bsize is split on ',' into width and height.
                var bsizes = content.Bsize.Split(',');
                var invariant = System.Globalization.CultureInfo.InvariantCulture;
                _ = newReplies.TryAdd((reply.Pid, content.OriginSrc), new ReplyImages
                {
                    Pid = reply.Pid,
                    HashOrUrl = content.OriginSrc,
                    BsizeWidth = uint.Parse(bsizes[0], invariant),
                    BsizeHeight = uint.Parse(bsizes[1], invariant),
                    ByteSize = content.OriginSize
                });
            }
        }
        catch (Exception ex)
        {
            // Best effort: tally the failure instead of aborting the whole migration.
            var key = $"{ex.GetType().FullName}: {ex.Message}";
            var times = exceptions.TryGetValue(key, out var seen) ? seen.times + 1 : 1;
            exceptions[key] = (times, reply.Pid);
        }
    }

    await SaveAndLog();
}
/// <summary>
/// Rewrites the stored content of replies: for every content item of type 3 whose
/// <c>OriginSrc</c> matches <c>ImgUrlExtractingRegex</c>, the URL is replaced by the value
/// of the regex's <c>hash</c> group, and replies with at least one such replacement get
/// their re-serialised content written back.
/// </summary>
/// <remarks>
/// Pending updates are written every 50000 replies and once more after the last reply. An
/// exception raised while handling a reply (including a regex match timeout) leaves that
/// reply unchanged; it is not rethrown, only counted per exception type and message, and
/// the counters are written to the trace log on every flush.
/// </remarks>
/// <param name="fid">Forum id passed to the <c>TbmDbContext</c> factory for both contexts.</param>
private async Task Convert4(Fid fid)
{
    await using var scope = _scope.BeginLifetimeScope();
    var db = scope.Resolve<TbmDbContext.New>()(fid);   // context the replies are read from
    var db2 = scope.Resolve<TbmDbContext.New>()(fid);  // context the updates are written with
    var replies = from r in db.ReplyContents where r.Content != null select r;
    var i = 0;
    using var process = Process.GetCurrentProcess();
    var stopwatch = Stopwatch.StartNew();

    // Exception "type: message" -> (how often it occurred, pid of the latest reply that raised it).
    var exceptions = new Dictionary<string, (uint times, ulong pid)>();

    // Replies whose content changed and still has to be written back.
    var newReplies = new List<ReplyContent>();

    // Writes the pending content updates, resets the batch and logs progress figures for it.
    async Task SaveAndLog()
    {
        // The entities are attached rather than loaded, so Content has to be flagged as
        // modified explicitly for it to be included in the update.
        db2.Set<ReplyContent>().AttachRange(newReplies);
        foreach (var entry in db2.ChangeTracker.Entries<ReplyContent>())
        {
            entry.Property(r => r.Content).IsModified = true;
        }

        _ = await db2.SaveChangesAsync();
        newReplies.Clear();
        db2.ChangeTracker.Clear();

        // Process caches its counters; without Refresh() every log line would repeat
        // the memory figure captured by the first read.
        process.Refresh();
        _logger.LogTrace("i:{} elapsed:{}ms mem:{}kb exceptions:{}", i,
            stopwatch.ElapsedMilliseconds,
            process.PrivateMemorySize64 / 1024,
            JsonSerializer.Serialize(exceptions, JsonSerializerOptions));
        stopwatch.Restart();
    }

    foreach (var reply in replies.AsNoTracking())
    {
        i++;
        if (i % 50000 == 0)
        {
            await SaveAndLog();
        }

        try
        {
            var contents = PostContentWrapper.Parser.ParseFrom(reply.Content);
            var isModified = false;
            foreach (var content in contents.Value.Where(content => content.Type == 3))
            {
                if (ImgUrlExtractingRegex.Match(content.OriginSrc) is not { Success: true } matches)
                {
                    continue;
                }

                content.OriginSrc = matches.Groups["hash"].Value;
                isModified = true;
            }

            if (isModified)
            {
                newReplies.Add(new ReplyContent
                {
                    Pid = reply.Pid,
                    Content = contents.ToByteArray()
                });
            }
        }
        catch (Exception ex)
        {
            // Best effort: tally the failure instead of aborting the whole migration.
            var key = $"{ex.GetType().FullName}: {ex.Message}";
            var times = exceptions.TryGetValue(key, out var seen) ? seen.times + 1 : 1;
            exceptions[key] = (times, reply.Pid);
        }
    }

    await SaveAndLog();
}
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- One row per image referenced by a reply; (pid, hashOrUrl) pairs are unique.
CREATE TABLE `tbm_f0_replies_img` (
    `id`          INT UNSIGNED    NOT NULL AUTO_INCREMENT,
    `pid`         BIGINT UNSIGNED NOT NULL,
    `hashOrUrl`   TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
    `bsizeWidth`  INT UNSIGNED    DEFAULT NULL,
    `bsizeHeight` INT UNSIGNED    DEFAULT NULL,
    `byteSize`    INT UNSIGNED    DEFAULT NULL,
    PRIMARY KEY (`id`),
    -- hashOrUrl is a TEXT column, so the unique index only covers its first 766 characters.
    UNIQUE KEY `pid` (`pid`, `hashOrUrl`(766)) USING BTREE
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment