Last active
July 28, 2022 21:14
-
-
Save n0099/a2ba4bacb47e31631fd1cf470a08b5c1 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using System.Text.Encodings.Web; | |
using TbClient.Post.Common; | |
using static System.Text.Json.JsonSerializer; | |
namespace tbm.Crawler | |
{ | |
public class MigrationWorker : BackgroundService | |
{ | |
/// <summary>Logger used for progress and diagnostic output of this worker.</summary>
private readonly ILogger<MigrationWorker> _logger;

/// <summary>Root lifetime scope; each unit of work opens its own child scope from it.</summary>
private readonly ILifetimeScope _scope0;

/// <summary>Creates the worker from its injected dependencies.</summary>
/// <param name="logger">Logger for this worker.</param>
/// <param name="scope0">Root lifetime scope used to resolve database contexts.</param>
public MigrationWorker(ILogger<MigrationWorker> logger, ILifetimeScope scope0) =>
    (_logger, _scope0) = (logger, scope0);
/// <summary>
/// Serializer settings shared by the JSON dumps written to the log by this worker.
/// Fields are included (value-tuple elements are fields, not properties) and the
/// relaxed encoder is used.
/// </summary>
private static readonly JsonSerializerOptions JsonSerializerOptions = new JsonSerializerOptions
{
    Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
    IncludeFields = true,
    // DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault
};
/// <summary>
/// Runs <see cref="Convert"/> once for every forum id found in <c>ForumsInfo</c>
/// and then terminates the process.
/// </summary>
/// <param name="stoppingToken">
/// Signalled when the host is shutting down. It is checked before each forum is
/// started; a conversion that is already running is not interrupted.
/// </param>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    await using var scope1 = _scope0.BeginLifetimeScope();
    var db = scope1.Resolve<TbmDbContext.New>()(0);

    // Materialize the ids up front so the query's reader is not held open
    // while each (potentially very long) conversion is awaited.
    var fids = (from f in db.ForumsInfo select f.Fid).ToList();
    foreach (var fid in fids)
    {
        // Honor host shutdown between forums instead of ignoring the token.
        // Returning here deliberately skips Environment.Exit: the host is already stopping.
        if (stoppingToken.IsCancellationRequested) return;

        _logger.LogInformation("converting for fid:{} started", fid);
        await Convert(fid);
        _logger.LogInformation("converting for fid:{} finished", fid);
    }

    // One-shot job: end the whole process once every forum has been handled.
    Environment.Exit(0);
}
/// <summary>
/// Rewrites the stored reply contents of one forum. For every reply that contains at
/// least one content item whose <c>Type</c> is 3, the fields <c>BigCdnSrc</c>,
/// <c>CdnSrc</c>, <c>CdnSrcActive</c>, <c>ShowOriginalBtn</c> and <c>IsLongPic</c> of
/// those items are reset and the re-serialized content is written back.
/// Updates are flushed in batches and progress is logged at trace level.
/// </summary>
/// <param name="fid">Forum id handed to the context factory for both contexts.</param>
private async Task Convert(Fid fid)
{
    await using var scope1 = _scope0.BeginLifetimeScope();
    // Two separate contexts: `db` streams the existing rows, `db2` receives the updates.
    var db = scope1.Resolve<TbmDbContext.New>()(fid);
    var db2 = scope1.Resolve<TbmDbContext.New>()(fid);
    var replies = from p in db.ReplyContents where p.Content != null select p;
    var i = 0;
    using var process = Process.GetCurrentProcess();
    var stopwatch = Stopwatch.StartNew();
    // Keyed by "<exception type>: <message>"; keeps a counter plus the latest offending reply.
    var exceptions = new Dictionary<string, (uint times, ulong pid, string content)>();
    var newReplies = new List<ReplyContent>();

    // Writes the pending batch to the database, then logs progress for the batch.
    void SaveAndLog()
    {
        db2.ReplyContents.AttachRange(newReplies);
        // Flag Content as modified on every attached entry so it is included in the update.
        foreach (var e in db2.ChangeTracker.Entries<ReplyContent>())
        {
            e.Property(nameof(ReplyContent.Content)).IsModified = true;
        }
        _ = db2.SaveChangesWithoutTimestamping();
        newReplies.Clear();
        db2.ChangeTracker.Clear();

        // Process caches its counters after the first read; refresh so the
        // logged memory figure reflects the current state.
        process.Refresh();
        _logger.LogTrace("i:{} elapsed:{}ms mem:{}kb exceptions:{}", i,
            stopwatch.ElapsedMilliseconds,
            process.PrivateMemorySize64 / 1024,
            Serialize(exceptions, JsonSerializerOptions));
        stopwatch.Restart();
    }

    foreach (var reply in replies.AsNoTracking())
    {
        i++;
        if (i % 10000 == 0) SaveAndLog();

        // NOTE(review): parsing happens outside the try below, so a malformed payload
        // aborts the whole run instead of being recorded in `exceptions` - confirm this is intended.
        var content = PostContentWrapper.Parser.ParseFrom(reply.Content);
        var isType3 = false;
        foreach (var c in content.Value.Where(c => c.Type == 3))
        {
            c.BigCdnSrc = "";
            c.CdnSrc = "";
            c.CdnSrcActive = "";
            c.ShowOriginalBtn = 0;
            c.IsLongPic = 0;
            isType3 = true;
        }
        // Replies without any type 3 item are left untouched.
        if (!isType3) continue;

        try
        {
            newReplies.Add(new()
            {
                Pid = reply.Pid,
                Content = content.ToByteArray()
            });
        }
        catch (Exception e)
        {
            var em = e.GetType().FullName + ": " + e.Message;
            // Single lookup; the sample content is captured on the first occurrence too,
            // and serialized with the same options as the rest of the diagnostics.
            var seenTimes = exceptions.TryGetValue(em, out var previous) ? previous.times : 0u;
            exceptions[em] = (seenTimes + 1u, reply.Pid, Serialize(content, JsonSerializerOptions));
        }
    }

    // Flush the final, partially filled batch.
    SaveAndLog();
}
/// <summary>
/// One statistics row: aggregated size and count of reply content items that share
/// the same forum, content type and set of populated field names.
/// </summary>
public class Stats
{
    /// <summary>Forum id the row was collected for.</summary>
    public Fid Fid { get; set; }

    /// <summary>Content item type.</summary>
    public uint Type { get; set; }

    /// <summary>Comma-separated names of the fields that were populated on the items.</summary>
    public string FieldNames { get; set; } = string.Empty;

    /// <summary>Sum of the serialized sizes, in bytes, of all counted items.</summary>
    public int TotalBytesSize { get; set; }

    /// <summary>Number of items counted into this row.</summary>
    public uint TotalCounts { get; set; }

    /// <summary>Post id of the reply that supplied the sample.</summary>
    public ulong SamplePid { get; set; }

    /// <summary>JSON rendering of one sample item.</summary>
    public string SampleContent { get; set; } = string.Empty;
}
/// <summary>
/// Scans every stored reply content of one forum and aggregates, per combination of
/// content type and populated field names, the total serialized size and the number
/// of items, together with one sample. The resulting <see cref="Stats"/> rows are
/// inserted through the second context once the scan has finished.
/// </summary>
/// <param name="fid">Forum id handed to the context factory and stored on every row.</param>
private async Task GenerateStats(Fid fid)
{
    await using var scope1 = _scope0.BeginLifetimeScope();
    // Two separate contexts: `db` streams the existing rows, `db2` stores the result.
    var db = scope1.Resolve<TbmDbContext.New>()(fid);
    var db2 = scope1.Resolve<TbmDbContext.New>()(fid);
    var replies = from p in db.ReplyContents where p.Content != null select p;
    var i = 0;
    using var process = Process.GetCurrentProcess();
    var stopwatch = Stopwatch.StartNew();
    var stats = new Dictionary<(uint Type, string FieldNames), Stats>();
    // Keyed by "<exception type>: <message>"; keeps a counter plus the latest offending reply.
    var exceptions = new Dictionary<string, (uint times, ulong pid, string content)>();

    // The reflected property list is the same for every item, so resolve it once.
    // `Type` is excluded because it is already part of the statistics key.
    var candidateProperties = typeof(Content)
        .GetProperties(BindingFlags.DeclaredOnly | BindingFlags.Instance | BindingFlags.Public)
        .Where(p => p.Name != nameof(Content.Type))
        .ToArray();

    // Logs progress; nothing is written to the database here.
    void LogProgress()
    {
        // Process caches its counters after the first read; refresh so the
        // logged memory figure reflects the current state.
        process.Refresh();
        _logger.LogTrace("i:{} elapsed:{}ms mem:{}kb exceptions:{}", i,
            stopwatch.ElapsedMilliseconds,
            process.PrivateMemorySize64 / 1024,
            Serialize(exceptions, JsonSerializerOptions));
        stopwatch.Restart();
    }

    foreach (var reply in replies.AsNoTracking())
    {
        i++;
        if (i % 50000 == 0) LogProgress();

        // NOTE(review): parsing happens outside the try below, so a malformed payload
        // aborts the whole run instead of being recorded in `exceptions` - confirm this is intended.
        var content = PostContentWrapper.Parser.ParseFrom(reply.Content).Value;
        try
        {
            foreach (var singleType in content)
            {
                if (singleType == null) continue;

                // Names of all properties whose value is neither null, numeric zero nor "".
                var fieldNames = string.Join(',', candidateProperties
                    // ReSharper disable once RedundantCast
                    .Where(p => p.GetValue(singleType) is not (null or (short)0 or (ushort)0 or 0 or (uint)0 or (long)0 or (ulong)0 or ""))
                    .Select(p => p.Name));
                var typeBytesSize = singleType.CalculateSize();
                var uniqueStats = (singleType.Type, fieldNames);

                if (stats.TryGetValue(uniqueStats, out var existing))
                {
                    // NOTE(review): TotalBytesSize is an int and is summed without an
                    // overflow check - confirm that very large forums cannot exceed it.
                    existing.TotalBytesSize += typeBytesSize;
                    existing.TotalCounts++;
                    continue;
                }

                // First item of this kind: only now pay for building the row and its JSON sample.
                stats.Add(uniqueStats, new()
                {
                    Fid = fid,
                    Type = singleType.Type,
                    FieldNames = fieldNames,
                    TotalBytesSize = typeBytesSize,
                    TotalCounts = 1,
                    SamplePid = reply.Pid,
                    SampleContent = JsonFormatter.Default.Format(singleType)
                });
            }
        }
        catch (Exception e)
        {
            var em = e.GetType().FullName + ": " + e.Message;
            // Single lookup; the sample content is captured on the first occurrence too.
            var seenTimes = exceptions.TryGetValue(em, out var previous) ? previous.times : 0u;
            exceptions[em] = (seenTimes + 1u, reply.Pid, Serialize(content, JsonSerializerOptions));
        }
    }

    LogProgress();
    // _logger.LogCritical(Serialize(stats.ToDictionary(pair => $"Type={pair.Key.Type},FieldNames={pair.Key.FieldNames}", pair => pair.Value), JsonSerializerOptions));
    db2.Set<Stats>().AddRange(stats.Values);
    db2.SaveChangesWithoutTimestamping();
}
} | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
-- Reply content statistics: one row per (type, fieldNames, fid) combination.
-- Presumably the backing table for the Stats rows produced by the C# worker; verify the mapping.
CREATE TABLE `tbm_reply_contents_stats` (
    `fid`            INT             NOT NULL,
    `type`           INT UNSIGNED    NOT NULL,
    `fieldNames`     TEXT CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
    `totalBytesSize` INT UNSIGNED    NOT NULL,
    `totalCounts`    INT UNSIGNED    NOT NULL,
    `samplePid`      BIGINT UNSIGNED NOT NULL,
    `sampleContent`  JSON            NOT NULL,
    -- fieldNames is a TEXT column, so only its first 766 characters take part in the key.
    PRIMARY KEY (`type`, `fieldNames`(766), `fid`) USING BTREE
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment