Skip to content

Instantly share code, notes, and snippets.

Last active July 28, 2022 21:14
Show Gist options
  • Save n0099/a2ba4bacb47e31631fd1cf470a08b5c1 to your computer and use it in GitHub Desktop.
Save n0099/a2ba4bacb47e31631fd1cf470a08b5c1 to your computer and use it in GitHub Desktop.
using System.Text.Encodings.Web;
using TbClient.Post.Common;
using static System.Text.Json.JsonSerializer;
namespace tbm.Crawler
/// <summary>
/// Hosted background service for a one-off data migration: <c>ExecuteAsync</c> reads every
/// forum id from <c>ForumsInfo</c> and runs <c>Convert</c> for each of them in turn.
/// </summary>
// NOTE(review): brace-only lines are absent throughout this listing (apparently lost in extraction),
// so the file does not compile as shown — restore them from the original before building.
public class MigrationWorker : BackgroundService
// Logger used for the per-forum progress messages and the periodic trace output.
private readonly ILogger<MigrationWorker> _logger;
// Root lifetime scope (presumably Autofac's ILifetimeScope — confirm); every method opens its own
// child scope from it via BeginLifetimeScope() and resolves its database contexts there.
private readonly ILifetimeScope _scope0;
/// <summary>
/// Stores the injected logger and root lifetime scope for later use by the worker.
/// </summary>
/// <param name="logger">Logger that receives progress and diagnostic messages.</param>
/// <param name="scope0">Root lifetime scope from which per-operation child scopes are opened.</param>
public MigrationWorker(ILogger<MigrationWorker> logger, ILifetimeScope scope0)
{
    (_logger, _scope0) = (logger, scope0);
}
/// <summary>
/// Serializer settings shared by the diagnostic JSON dumps written to the trace log.
/// </summary>
/// <remarks>
/// <c>IncludeFields</c> is switched on because the exception tallies being serialized hold value
/// tuples, whose elements are fields rather than properties. The relaxed encoder leaves
/// non-ASCII text unescaped in the output.
/// </remarks>
private static readonly JsonSerializerOptions JsonSerializerOptions = new JsonSerializerOptions
{
    Encoder = JavaScriptEncoder.UnsafeRelaxedJsonEscaping,
    IncludeFields = true,
    // DefaultIgnoreCondition = JsonIgnoreCondition.WhenWritingDefault
};
/// <summary>
/// Entry point of the background service: converts the stored reply contents of every forum,
/// one forum at a time.
/// </summary>
/// <param name="stoppingToken">
/// Signalled by the host on shutdown. It is checked before each forum is started, so a stop
/// request takes effect between forums rather than being ignored until every forum is done.
/// </param>
/// <returns>A task that completes when all forums are processed or a stop was requested.</returns>
protected override async Task ExecuteAsync(CancellationToken stoppingToken)
{
    await using var scope1 = _scope0.BeginLifetimeScope();
    var db = scope1.Resolve<TbmDbContext.New>()(0);

    // Materialise the forum ids up front: each Convert call can run for a long time, and streaming
    // the outer query meanwhile would keep its result set open for the whole migration.
    var fids = (from f in db.ForumsInfo select f.Fid).ToList();

    foreach (var fid in fids)
    {
        // Honour host shutdown between forums; a conversion already in progress is not interrupted.
        if (stoppingToken.IsCancellationRequested)
        {
            _logger.LogInformation("converting cancelled before fid:{}", fid);
            break;
        }

        _logger.LogInformation("converting for fid:{} started", fid);
        await Convert(fid);
        _logger.LogInformation("converting for fid:{} finished", fid);
    }
}
/// <summary>
/// Rewrites the stored reply contents of one forum: every content item whose <c>Type</c> is 3 has
/// its <c>BigCdnSrc</c>, <c>CdnSrc</c>, <c>CdnSrcActive</c>, <c>ShowOriginalBtn</c> and
/// <c>IsLongPic</c> fields reset, and the re-serialized bytes are staged for saving.
/// </summary>
/// <param name="fid">Forum id passed to the <c>TbmDbContext.New</c> factory to pick the data to convert.</param>
// NOTE(review): this listing has visibly lost lines — all braces, the `try` matching the `catch`
// below, and the statement that owns the `Pid = ..., Content = ...` initializer. Restore them from
// the original source before compiling; the comments here describe only what is still visible.
private async Task Convert(Fid fid)
await using var scope1 = _scope0.BeginLifetimeScope();
// Two separate contexts: `db` is only read from, `db2` is the one whose change tracker is saved.
var db = scope1.Resolve<TbmDbContext.New>()(fid);
var db2 = scope1.Resolve<TbmDbContext.New>()(fid);
var replies = from p in db.ReplyContents where p.Content != null select p;
// Progress counter used for batching. NOTE(review): no increment is visible in this listing.
var i = 0;
using var process = Process.GetCurrentProcess();
// NOTE(review): the stopwatch is created but never started or read in the visible lines.
var stopwatch = new Stopwatch();
// Tally of failures keyed by "<exception type>: <message>": occurrence count plus a sample pid and content.
var exceptions = new Dictionary<string, (uint times, ulong pid, string content)>();
// NOTE(review): not referenced again in the visible lines.
var newReplies = new List<ReplyContent>();
// Flushes everything tracked by db2 and writes a progress line to the trace log.
void SaveAndLog()
// Force the Content column to be written for every tracked ReplyContent entry.
foreach (var e in db2.ChangeTracker.Entries<ReplyContent>())
e.Property(nameof(ReplyContent.Content)).IsModified = true;
_ = db2.SaveChangesWithoutTimestamping();
// NOTE(review): the template has four placeholders but only three arguments are visible (i, memory,
// exceptions); the value for `elapsed` appears to be missing from this listing.
_logger.LogTrace("i:{} elapsed:{}ms mem:{}kb exceptions:{}", i,
process.PrivateMemorySize64 / 1024,
Serialize(exceptions, JsonSerializerOptions));
// Read without change tracking; modified rows are meant to reach the database through db2 only.
foreach (var reply in replies.AsNoTracking())
// Save and log once every 10000 replies.
if (i % 10000 == 0) SaveAndLog();
// Content is a serialized protobuf PostContentWrapper message.
var content = PostContentWrapper.Parser.ParseFrom(reply.Content);
var isType3 = false;
// Reset the listed fields on every item of type 3 (presumably image items — confirm).
foreach (var c in content.Value.Where(c => c.Type == 3))
c.BigCdnSrc = "";
c.CdnSrc = "";
c.CdnSrcActive = "";
c.ShowOriginalBtn = 0;
c.IsLongPic = 0;
isType3 = true;
// Replies without any type-3 item are left untouched.
if (!isType3) continue;
// NOTE(review): the line that opens this object initializer is missing; presumably a ReplyContent
// carrying the re-serialized bytes is handed to db2 here — confirm against the original.
Pid = reply.Pid,
Content = content.ToByteArray()
// Failures are tallied per distinct message instead of aborting the run.
catch (Exception e)
var em = e.GetType().FullName + ": " + e.Message;
// First occurrence inserts a fresh entry; later ones update the existing tuple.
if (!exceptions.TryAdd(em, (1, reply.Pid, "")))
var a = exceptions[em];
// NOTE(review): garbled line — most likely `a.times++;` followed by `a.pid = reply.Pid;`.
a.times++; = reply.Pid;
// NOTE(review): serialized without the shared JsonSerializerOptions, unlike GenerateStats.
a.content = Serialize(content);
// The tuple is a value type, so the modified copy has to be written back.
exceptions[em] = a;
/// <summary>
/// One aggregated statistics record for a distinct combination of content type and populated
/// field names within a forum, as filled in by <c>GenerateStats</c>.
/// </summary>
public class Stats
{
    /// <summary>Forum id the record was collected for.</summary>
    public Fid Fid { get; set; }

    /// <summary>Value of the content item's <c>Type</c> field.</summary>
    public uint Type { get; set; }

    /// <summary>Comma-joined names of the content properties that held a non-default value.</summary>
    public string FieldNames { get; set; } = string.Empty;

    /// <summary>Accumulated serialized size, in bytes, of the matching content items.</summary>
    public int TotalBytesSize { get; set; }

    /// <summary>Number of matching content items counted.</summary>
    public uint TotalCounts { get; set; }

    /// <summary>Post id of the reply that first produced this combination.</summary>
    public ulong SamplePid { get; set; }

    /// <summary>JSON rendering of the first content item seen with this combination.</summary>
    public string SampleContent { get; set; } = string.Empty;
}
/// <summary>
/// Scans the stored reply contents of one forum and aggregates, per distinct
/// (content type, set of populated field names), the serialized size together with a sample reply.
/// </summary>
/// <param name="fid">Forum id passed to the <c>TbmDbContext.New</c> factory to pick the data to scan.</param>
// NOTE(review): this listing has visibly lost lines — all braces, the `try` matching the `catch`
// below, and the opening brace of the `new()` initializer. Restore them from the original source
// before compiling; the comments here describe only what is still visible.
// NOTE(review): nothing in the visible file calls this method, and the collected `stats` are only
// referenced by the commented-out logging at the end.
private async Task GenerateStats(Fid fid)
await using var scope1 = _scope0.BeginLifetimeScope();
var db = scope1.Resolve<TbmDbContext.New>()(fid);
// NOTE(review): db2 is resolved but not used in the visible lines.
var db2 = scope1.Resolve<TbmDbContext.New>()(fid);
var replies = from p in db.ReplyContents where p.Content != null select p;
// Progress counter used for periodic logging. NOTE(review): no increment is visible in this listing.
var i = 0;
using var process = Process.GetCurrentProcess();
// NOTE(review): the stopwatch is created but never started or read in the visible lines.
var stopwatch = new Stopwatch();
// Aggregates keyed by (content type, comma-joined populated field names).
var stats = new Dictionary<(uint Type, string FieldNames), Stats>();
// Tally of failures keyed by "<exception type>: <message>": occurrence count plus a sample pid and content.
var exceptions = new Dictionary<string, (uint times, ulong pid, string content)>();
// Despite the name, this local function only writes a progress line; nothing is saved here.
void SaveAndLog()
// NOTE(review): the template has four placeholders but only three arguments are visible (i, memory,
// exceptions); the value for `elapsed` appears to be missing from this listing.
_logger.LogTrace("i:{} elapsed:{}ms mem:{}kb exceptions:{}", i,
process.PrivateMemorySize64 / 1024,
Serialize(exceptions, JsonSerializerOptions));
// Read-only scan, so change tracking is disabled.
foreach (var reply in replies.AsNoTracking())
// Log progress once every 50000 replies.
if (i % 50000 == 0) SaveAndLog();
// Content is a serialized protobuf PostContentWrapper; Value is its sequence of content items.
var content = PostContentWrapper.Parser.ParseFrom(reply.Content).Value;
foreach (var singleType in content)
if (singleType == null) continue;
// Via reflection, collect the names of the public instance properties declared on Content
// (other than Type) whose value on this item is not null, a zero of any listed integer type, or "".
var fieldNames = string.Join(',', typeof(Content).GetProperties(BindingFlags.DeclaredOnly | BindingFlags.Instance | BindingFlags.Public)
// ReSharper disable once RedundantCast
.Where(p => p.Name != nameof(Content.Type)
&& p.GetValue(singleType) is not (null or (short)0 or (ushort)0 or 0 or (uint)0 or (long)0 or (ulong)0 or ""))
.Select(p => p.Name));
// Serialized protobuf size of this single content item.
var typeBytesSize = singleType.CalculateSize();
var uniqueStats = (singleType.Type, fieldNames);
// First sighting of this combination: record it along with this reply as the sample, then move on.
if (stats.TryAdd(uniqueStats, new()
Fid = fid,
Type = singleType.Type,
FieldNames = fieldNames,
TotalBytesSize = typeBytesSize,
TotalCounts = 1,
SamplePid = reply.Pid,
SampleContent = JsonFormatter.Default.Format(singleType)
})) continue;
// Combination already known: add this item's size to the running total.
// NOTE(review): no matching TotalCounts increment is visible — possibly lost from this listing.
stats[uniqueStats].TotalBytesSize += typeBytesSize;
// Failures are tallied per distinct message instead of aborting the scan.
catch (Exception e)
var em = e.GetType().FullName + ": " + e.Message;
// First occurrence inserts a fresh entry; later ones update the existing tuple.
if (!exceptions.TryAdd(em, (1, reply.Pid, "")))
var a = exceptions[em];
// NOTE(review): garbled line — most likely `a.times++;` followed by `a.pid = reply.Pid;`.
a.times++; = reply.Pid;
a.content = Serialize(content, JsonSerializerOptions);
// The tuple is a value type, so the modified copy has to be written back.
exceptions[em] = a;
// Disabled diagnostics kept from the original: per-item error logging and a final dump of the stats.
// _logger.LogError(e, "exception spid:{} content:{}", subReply.Spid, subReply.Content);
// _logger.LogCritical(Serialize(stats.ToDictionary(pair => $"Type={pair.Key.Type},FieldNames={pair.Key.FieldNames}", pair => pair.Value), JsonSerializerOptions));
-- Table for the reply-content statistics; its columns mirror the properties of the C# Stats class
-- (fid, type, fieldNames, totalBytesSize, totalCounts, samplePid, sampleContent).
-- NOTE(review): the closing line of this statement (")" plus any table options) is not visible here.
CREATE TABLE `tbm_reply_contents_stats` (
-- NOTE(review): signed int, while every other numeric column is unsigned — confirm this is intended.
`fid` int NOT NULL,
`type` int unsigned NOT NULL,
-- Comma-joined list of populated field names; part of the primary key below.
`fieldNames` text CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci NOT NULL,
`totalBytesSize` int unsigned NOT NULL,
`totalCounts` int unsigned NOT NULL,
`samplePid` bigint unsigned NOT NULL,
`sampleContent` json NOT NULL,
-- One row per (type, fieldNames, fid). A TEXT column needs a key prefix length; 766 characters was
-- presumably chosen to stay within InnoDB's index key size limit for utf8mb4 — confirm.
PRIMARY KEY (`type`,`fieldNames`(766),`fid`) USING BTREE
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment