Created
February 27, 2021 00:22
-
-
Save kosorin/115dc339add0e8953443020901454f5c to your computer and use it in GitHub Desktop.
ČSFD exporter (to IMDb format)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<!-- MSBuild project for the exporter: a console executable built with the Microsoft.NET.Sdk SDK. -->
<Project Sdk="Microsoft.NET.Sdk"> | |
<PropertyGroup> | |
<!-- Produce an executable rather than a class library. -->
<OutputType>Exe</OutputType> | |
<!-- Target .NET 5. -->
<TargetFramework>net5.0</TargetFramework> | |
<!-- Enable nullable reference type annotations and warnings. -->
<Nullable>enable</Nullable> | |
</PropertyGroup> | |
<ItemGroup> | |
<!-- HTML parsing library used by the program (imported as "using HtmlAgilityPack"). -->
<PackageReference Include="HtmlAgilityPack" Version="1.11.30" /> | |
</ItemGroup> | |
</Project> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
using HtmlAgilityPack; | |
using System; | |
using System.Collections.Generic; | |
using System.Globalization; | |
using System.IO; | |
using System.Linq; | |
using System.Net; | |
using System.Text; | |
using System.Text.RegularExpressions; | |
// Program entry point (top-level statements).
// Usage: <UserId> <OutputPath>
// Downloads the ratings of the given ČSFD user and writes them to OutputPath as a CSV
// file with IMDb-style columns.
// Exit codes: 0 = success, 1 = unhandled exception, 2 = missing command-line argument.
try
{
    // Both positional arguments are mandatory; report the first one that is missing.
    switch (args.Length)
    {
        case 0:
            Console.Error.WriteLine("Missing argument: UserId");
            return 2;
        case 1:
            Console.Error.WriteLine("Missing argument: OutputPath");
            return 2;
    }

    var csfdUserId = args[0];
    var csvPath = args[1];

    using var csfd = new CsfdApi("https://www.csfd.cz");
    // Declared after the API client, so the writer is flushed and closed first when the scope ends.
    using var csv = new StreamWriter(csvPath, append: false, Encoding.UTF8);

    csv.WriteLine("Const,Your Rating,Date Rated,Title,URL,Title Type,IMDb Rating,Runtime (mins),Year,Genres,Num Votes,Release Date,Directors");

    foreach (var entry in csfd.GetUserRatings(csfdUserId))
    {
        // Only films with a known IMDb ID can be exported.
        if (entry.ImdbId == null)
        {
            continue;
        }

        var imdb = "tt" + entry.ImdbId;

        // Linear mapping of the scraped rating r to round(1 + r * 9/5).
        var scaled = 1 + (entry.Rating * (9d / 5d));
        var rating = (int)Math.Round(scaled);

        var date = entry.Date.ToString("yyyy-MM-dd", CultureInfo.InvariantCulture);

        // Replace every non-word character so the title cannot break the CSV row.
        var title = Regex.Replace(entry.Title, @"\W", "-");

        csv.WriteLine($"{imdb},{rating},{date},{title},,movie,,,,,,,");
    }

    return 0;
}
catch (Exception failure)
{
    Console.Error.WriteLine(failure);
    return 1;
}
/// <summary>
/// A single film rating scraped from a user's rating list.
/// </summary>
/// <param name="Title">Film title as read from the rating list.</param>
/// <param name="Rating">Rating value as read from the rating list.</param>
/// <param name="Date">Date on which the rating was given.</param>
/// <param name="CsfdId">Film identifier taken from the ČSFD film link.</param>
/// <param name="ImdbId">Numeric part of the IMDb title ID (without the "tt" prefix).</param>
internal record UserRating(
    string Title,
    int Rating,
    DateTime Date,
    string CsfdId,
    string ImdbId);
/// <summary>
/// Scrapes a user's film ratings from a ČSFD website and resolves each rated film
/// to its IMDb ID by following the film's detail page.
/// </summary>
internal sealed class CsfdApi : IDisposable
{
    private const string PaginatorLinksXPath = "//div[@class='paginator text'][1]/a[not(@class)]";
    private const string RatingRowsXPath = "//table[@class='ui-table-list']/tbody/tr";
    private const string ShareLinksXPath = "//div[@id='share']/ul[@class='links']/li/a";

    // Captures the part of a relative film link between "/film/" and the first hyphen.
    private static readonly Regex FilmIdRegex = new(@"^/film/(?<Id>[^-]+)-");

    // Captures the digits of an IMDb title ID inside an IMDb title URL.
    private static readonly Regex ImdbLinkRegex = new(@"imdb\.com/title/tt(?<Id>[0-9]+)");

    private readonly UrlBuilder _urlBuilder;
    private readonly HttpsWebClient _webClient;
    private bool _disposed;

    /// <summary>Creates a client for the site at <paramref name="rootUrl"/>.</summary>
    /// <param name="rootUrl">Scheme and host of the site, without a trailing slash.</param>
    public CsfdApi(string rootUrl)
    {
        _urlBuilder = new UrlBuilder(rootUrl);
        _webClient = new HttpsWebClient();
    }

    /// <summary>Releases the underlying web client. Safe to call more than once.</summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _webClient.Dispose();
        _disposed = true;
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Lazily enumerates the ratings of <paramref name="userId"/>, page by page.
    /// Each returned rating is also echoed to standard output. Rows that cannot be
    /// resolved to an IMDb ID (or that have an unexpected layout) are reported on
    /// standard error and skipped.
    /// </summary>
    /// <param name="userId">User identifier as it appears in the user's profile URL.</param>
    /// <returns>A deferred sequence; pages and film details are downloaded while it is enumerated.</returns>
    /// <exception cref="WebException">A page could not be downloaded.</exception>
    /// <exception cref="FormatException">A rating date is not in <c>dd.MM.yyyy</c> format.</exception>
    public IEnumerable<UserRating> GetUserRatings(string userId)
    {
        var documentNode = DownloadHtml(_urlBuilder.UserRating(userId, 1));

        // Page count = first page + number of paginator links that carry no class attribute.
        // A rating list that fits on one page has no such links, which must yield 1, not a crash.
        var pageCount = 1 + SelectAll(documentNode, PaginatorLinksXPath).Count;

        for (var page = 1u; page <= pageCount; page++)
        {
            foreach (var row in SelectAll(documentNode, RatingRowsXPath))
            {
                var result = ReadRating(row);
                if (result == null)
                {
                    continue;
                }

                Console.WriteLine(result);
                yield return result;
            }

            // Fetch the next page only when there is one, so the last iteration does no extra request.
            if (page < pageCount)
            {
                documentNode = DownloadHtml(_urlBuilder.UserRating(userId, page + 1));
            }
        }
    }

    // Converts one table row of the rating list into a UserRating, or returns null (after
    // writing a warning to standard error) when the row cannot be exported.
    private UserRating? ReadRating(HtmlNode row)
    {
        var filmNode = row.SelectSingleNode("td[1]/a");
        var dateNode = row.SelectSingleNode("td[3]");
        if (filmNode == null || dateNode == null)
        {
            Console.Error.WriteLine("Skipping rating row with unexpected layout");
            return null;
        }

        // InnerText is still HTML-encoded (e.g. "&amp;"); decode it to get the real title.
        var title = WebUtility.HtmlDecode(filmNode.InnerText);

        var idMatch = FilmIdRegex.Match(filmNode.GetAttributeValue("href", string.Empty));
        if (!idMatch.Success)
        {
            // Without an ID the detail URL would be malformed, so do not even request it.
            Console.Error.WriteLine($"Skipping rating row with unrecognized film link ({title})");
            return null;
        }

        var csfdId = idMatch.Groups["Id"].Value;

        // The rating is the length of the image's alt text; a row without the image counts as 0.
        // NOTE(review): presumably the alt text holds one character per star - confirm against the site.
        var rating = row.SelectSingleNode("td[2]/img")?.Attributes["alt"]?.ValueLength ?? 0;

        var date = DateTime.ParseExact(dateNode.InnerText.Trim(), "dd.MM.yyyy", CultureInfo.InvariantCulture);

        var imdbId = FindImdbId(csfdId);
        if (imdbId == null)
        {
            Console.Error.WriteLine($"Missing IMDb ID for ČSFD ID {csfdId} ({title})");
            return null;
        }

        return new UserRating(title, rating, date, csfdId, imdbId);
    }

    // Downloads the film's detail page and returns the digits of the first IMDb title link
    // found among its share links, or null when there is none.
    private string? FindImdbId(string csfdId)
    {
        var filmDocumentNode = DownloadHtml(_urlBuilder.Film(csfdId));

        foreach (var shareLinkNode in SelectAll(filmDocumentNode, ShareLinksXPath))
        {
            var shareLinkUrl = shareLinkNode.GetAttributeValue("href", string.Empty);
            var match = ImdbLinkRegex.Match(shareLinkUrl);
            if (match.Success)
            {
                return match.Groups["Id"].Value;
            }
        }

        return null;
    }

    // SelectNodes returns null instead of an empty collection when nothing matches;
    // normalise that to an empty list so callers can iterate and count safely.
    private static IList<HtmlNode> SelectAll(HtmlNode node, string xpath)
    {
        return node.SelectNodes(xpath) ?? (IList<HtmlNode>)Array.Empty<HtmlNode>();
    }

    // Downloads the page at the given URL and returns the root node of the parsed document.
    private HtmlNode DownloadHtml(string url)
    {
        var html = _webClient.DownloadString(url);
        var document = new HtmlDocument();
        document.LoadHtml(html);
        return document.DocumentNode;
    }

    // WebClient that asks for, and transparently decodes, gzip/deflate-compressed HTTP responses.
    private sealed class HttpsWebClient : WebClient
    {
        protected override WebRequest GetWebRequest(Uri address)
        {
            var request = base.GetWebRequest(address);
            if (request is HttpWebRequest httpRequest)
            {
                httpRequest.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
            }

            return request;
        }
    }

    // Builds the absolute URLs of the pages this client reads.
    private sealed class UrlBuilder
    {
        public UrlBuilder(string rootUrl)
        {
            RootUrl = rootUrl;
        }

        private string RootUrl { get; }

        // Overview page of a single film.
        public string Film(string filmId) => $"{RootUrl}/film/{filmId}/prehled/";

        // One page of a user's rating list; page numbers are passed through as given.
        public string UserRating(string userId, uint page) => $"{RootUrl}/uzivatel/{userId}/hodnoceni/strana-{page}/";
    }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment