Skip to content

Instantly share code, notes, and snippets.

@pikami
Created June 13, 2022 20:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pikami/1bc4835288c857fe73552711506377c8 to your computer and use it in GitHub Desktop.
Save pikami/1bc4835288c857fe73552711506377c8 to your computer and use it in GitHub Desktop.
package main
import (
"context"
"encoding/json"
"fmt"
"io/ioutil"
"log"
"net/http"
"strconv"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
elasticsearch "github.com/elastic/go-elasticsearch"
"github.com/elastic/go-elasticsearch/esapi"
"golang.org/x/net/html"
)
// file is a single entry of a torrent's file list.
type file struct {
// Path holds the folder names leading to the file followed by the file
// title itself, as assembled by getFiles.
Path []interface{} `json:"path"`
// Length is the file size in bytes, as computed by humanReadableToBytes
// from the size text shown on the page.
Length int `json:"length"`
}
// bitTorrent is the document that gets serialized to JSON and pushed to the
// Elasticsearch index for every scraped torrent page.
type bitTorrent struct {
// InfoHash is the text of the SELECTOR_HASH element with \n and \t removed.
// pushToIndex also uses it as the document id.
InfoHash string `json:"infohash"`
// Name is the text of the SELECTOR_NAME element with \n and \t removed.
Name string `json:"name"`
// Description is the unmodified text of the SELECTOR_DESCRIPTION element.
Description string `json:"description"`
// Files is the flattened file tree; left out of the JSON when empty.
Files []file `json:"files,omitempty"`
// Length is the total size in bytes derived from the SELECTOR_SIZE text;
// left out of the JSON when zero.
Length int `json:"length,omitempty"`
// DateLastIndexed is filled by grabTorrent from the page's data-timestamp
// attribute (the upload date) after conversion by unixToUTC.
DateLastIndexed string `json:"dateLastIndexed,omitempty"`
// Source is set to SOURCE_NAME by grabTorrent.
Source string `json:"source"`
}
// Scraper configuration: the source label, the nyaa.si endpoints and the CSS
// selectors used to pull individual fields out of the downloaded pages.
const (
	// SOURCE_NAME is written to the Source field of every scraped torrent.
	SOURCE_NAME = "lili-nyaa"

	// TORRENT_LIST_URL is the listing page used to discover the newest id.
	TORRENT_LIST_URL = "https://nyaa.si"
	// BASE_URL is the torrent page prefix; the numeric id is appended to it.
	BASE_URL = "https://nyaa.si/view/"

	// SELECTOR_FIRST_TORRENT_LINK matches torrent links on the listing page.
	SELECTOR_FIRST_TORRENT_LINK = ".torrent-list td:nth-child(2) > a"

	// Selectors applied to a single torrent page.
	SELECTOR_NAME        = ".panel:nth-child(1) .panel-title"
	SELECTOR_HASH        = ".panel:nth-child(1) kbd"
	SELECTOR_DESCRIPTION = "#torrent-description"
	SELECTOR_UPLOAD_DATE = ".panel:nth-child(1) div[data-timestamp]"
	SELECTOR_FILES       = ".torrent-file-list > ul"
	SELECTOR_SIZE        = ".panel:nth-child(1) div.row:nth-child(4) > div:nth-child(2)"
)
// main connects to Elasticsearch, looks up the id of the most recent torrent
// on the listing page and then walks every id from 0 up to and including
// that id, indexing each torrent page that exists.
//
// A 404 for an id is skipped; any other non-200 response code aborts the run.
func main() {
	// Elasticsearch connection.
	cfg := elasticsearch.Config{
		Transport: &http.Transport{
			ResponseHeaderTimeout: 5 * time.Second,
		},
		Addresses: []string{
			"http://10.8.0.6:9200",
		},
	}
	es, err := elasticsearch.NewClient(cfg)
	if err != nil {
		log.Fatalf("Error creating Elasticsearch client: %s", err)
	}

	// Most recent torrent.
	newestId, resCode := getMostRecentTorrentId()
	if resCode != 200 {
		log.Fatalf("Unexpected response code: %d", resCode)
	}
	log.Printf("Most recent torrent id: %d", newestId)

	// Iterate through all torrents. The bound is inclusive so that the
	// newest torrent itself is indexed as well.
	for i := int64(0); i <= newestId; i++ {
		// Throttle to one request per second to avoid hammering the site.
		time.Sleep(time.Second)

		torrent, resCode := grabTorrent(i)
		switch resCode {
		case 200:
			pushToIndex(es, torrent)
		case 404:
			// No torrent page for this id; skip it.
		default:
			log.Fatalf("Unexpected response code: %d", resCode)
		}
	}
}
// pushToIndex serializes torrent to JSON and indexes it into the
// "nyaa_scrape" Elasticsearch index, using the info hash as the document id.
//
// A torrent that cannot be serialized, or that Elasticsearch answers with an
// error status, is logged and skipped. Failing to perform the request at all
// terminates the program.
func pushToIndex(es *elasticsearch.Client, torrent bitTorrent) {
	data, err := json.Marshal(torrent)
	if err != nil {
		log.Printf("Error encoding torrent %q: %s", torrent.InfoHash, err)
		return
	}

	req := esapi.IndexRequest{
		Index:        "nyaa_scrape",
		DocumentType: "torrent",
		DocumentID:   torrent.InfoHash,
		Body:         strings.NewReader(fmt.Sprintf("%s\n\n", data)),
		Refresh:      "true",
	}
	res, err := req.Do(context.Background(), es)
	if err != nil {
		log.Fatalf("Error getting response: %s", err)
	}
	defer res.Body.Close()

	// A completed round trip can still carry an error status from the
	// server; report it instead of dropping it silently.
	if res.IsError() {
		log.Printf("Error indexing torrent %q: %s", torrent.InfoHash, res.Status())
	}
}
// getMostRecentTorrentId downloads the torrent listing page and extracts the
// numeric id from the first torrent link found on it.
//
// It returns the id together with the HTTP status code of the listing
// request. On a non-200 status the id is 0. If the page cannot be parsed or
// no numeric id can be extracted, the program is terminated.
func getMostRecentTorrentId() (int64, int) {
	// Get listing page html.
	htmlStr, resCode := getHtml(TORRENT_LIST_URL)
	if resCode != 200 {
		return 0, resCode
	}
	rootNode, err := html.Parse(strings.NewReader(htmlStr))
	if err != nil {
		log.Fatalln(err)
	}
	doc := goquery.NewDocumentFromNode(rootNode)

	// Grab the first torrent link; its last path segment is the id.
	torrentUrl := doc.Find(SELECTOR_FIRST_TORRENT_LINK).First().AttrOr("href", "")
	idStr := torrentUrl[strings.LastIndex(torrentUrl, "/")+1:]
	// Ignore a trailing fragment or query such as "#comments".
	if cut := strings.IndexAny(idStr, "#?"); cut >= 0 {
		idStr = idStr[:cut]
	}

	idInt, err := strconv.ParseInt(idStr, 10, 64)
	if err != nil {
		log.Fatalf("Failed to grab most recent torrent id from link %q: %s", torrentUrl, err)
	}
	return idInt, resCode
}
// getFiles recursively flattens one <li> element of the file tree into a
// list of files.
//
// ChildrenFiltered is used throughout so that only direct children of the
// current element are inspected instead of the whole subtree.
//
//	path - directories leading to the current element
//	s    - selection holding the <li> element
//
// For a folder element the result contains the files of all nested entries;
// for a file element it contains exactly that file.
func getFiles(path []interface{}, s *goquery.Selection) []file {
	// Clip the capacity so every append below allocates a fresh backing
	// array. Without this, sibling entries can share spare capacity and
	// overwrite each other's last path element.
	path = path[:len(path):len(path)]

	if folder := s.ChildrenFiltered(".folder").Text(); folder != "" {
		result := make([]file, 0)
		subPath := append(path, folder)
		s.ChildrenFiltered("ul").ChildrenFiltered("li").
			Each(func(_ int, child *goquery.Selection) {
				result = append(result, getFiles(subPath, child)...)
			})
		return result
	}

	// The file title is the last bare text node of the element.
	fileTitle := ""
	s.Contents().Each(func(_ int, node *goquery.Selection) {
		if goquery.NodeName(node) == "#text" {
			fileTitle = node.Text()
		}
	})

	return []file{{
		Path:   append(path, fileTitle),
		Length: humanReadableToBytes(s.Find(".file-size").Text()),
	}}
}
// humanReadableToBytes converts a human-readable size such as "686.2 MiB"
// or "(1.5 GiB)" to a number of bytes.
// example: 686.2 MiB > 719532851
//
// Parentheses and surrounding whitespace are ignored. The input must consist
// of a number and a unit; KiB, MiB, GiB and TiB are scaled by powers of 1024
// and any other unit is taken to mean plain bytes. Input that does not have
// this shape, or whose number cannot be parsed, yields 0. Fractional bytes
// are truncated.
func humanReadableToBytes(str string) int {
	cleaned := strings.NewReplacer("(", "", ")", "").Replace(str)
	parts := strings.Fields(cleaned)
	if len(parts) != 2 {
		return 0
	}

	// Parse at full float64 precision; a 32-bit parse visibly skews the
	// result once it is multiplied by a large unit factor.
	size, err := strconv.ParseFloat(parts[0], 64)
	if err != nil {
		return 0
	}

	switch parts[1] {
	case "KiB":
		return int(size * (1 << 10))
	case "MiB":
		return int(size * (1 << 20))
	case "GiB":
		return int(size * (1 << 30))
	case "TiB":
		return int(size * (1 << 40))
	}
	return int(size)
}
// grabTorrent downloads the nyaa page of the torrent with the given id and
// scrapes it into a bitTorrent value.
//
// The second return value is the HTTP status code of the page request; when
// it is not 200 the returned torrent is empty. A page that cannot be parsed
// as HTML terminates the program. The scraped fields are also echoed to
// standard output.
func grabTorrent(id int64) (bitTorrent, int) {
	pageUrl := BASE_URL + strconv.FormatInt(id, 10)
	htmlStr, resCode := getHtml(pageUrl)
	if resCode != 200 {
		return bitTorrent{}, resCode
	}

	rootNode, err := html.Parse(strings.NewReader(htmlStr))
	if err != nil {
		log.Fatalln(err)
	}
	doc := goquery.NewDocumentFromNode(rootNode)

	// firstText returns the text of the first element matching selector.
	firstText := func(selector string) string {
		return doc.Find(selector).First().Text()
	}

	// Plain text fields.
	name := cleanString(firstText(SELECTOR_NAME))
	hash := cleanString(firstText(SELECTOR_HASH))
	description := firstText(SELECTOR_DESCRIPTION)
	size := firstText(SELECTOR_SIZE)

	// The upload date lives in an attribute rather than in the text.
	uploadDate := doc.Find(SELECTOR_UPLOAD_DATE).First().AttrOr("data-timestamp", "0")

	// Flatten the file tree, starting from an empty directory path.
	fileListElement := doc.Find(SELECTOR_FILES).First()
	fileList := make([]file, 0)
	rootPath := make([]interface{}, 0)
	topLevelEntries := fileListElement.ChildrenFiltered("ul > li")
	topLevelEntries.Each(func(_ int, entry *goquery.Selection) {
		fileList = append(fileList, getFiles(rootPath, entry)...)
	})

	// Echo what was scraped, one field per line.
	for _, line := range []string{
		name,
		hash,
		description,
		cleanString(size),
		uploadDate,
		fileListElement.Text(),
	} {
		fmt.Println(line)
	}

	torrent := bitTorrent{
		InfoHash:        hash,
		Name:            name,
		Description:     description,
		Files:           fileList,
		Length:          humanReadableToBytes(size),
		DateLastIndexed: unixToUTC(uploadDate),
		Source:          SOURCE_NAME,
	}
	return torrent, resCode
}
// cleanString returns str with every newline and tab character removed.
// All other characters, including spaces, are kept as they are.
func cleanString(str string) string {
	stripper := strings.NewReplacer("\n", "", "\t", "")
	return stripper.Replace(str)
}
// unixToUTC converts a unix timestamp given as a decimal string of seconds
// to an ISO-8601 timestamp in UTC, e.g. "0" becomes "1970-01-01T00:00:00Z".
//
// If unixTimeStamp is not a valid integer the current time is used instead.
func unixToUTC(unixTimeStamp string) string {
	res := time.Now()
	if seconds, err := strconv.ParseInt(unixTimeStamp, 10, 64); err == nil {
		res = time.Unix(seconds, 0)
	}
	// Go layouts are written in terms of the reference time
	// 2006-01-02 15:04:05. Convert to UTC first so that the literal "Z"
	// suffix is truthful.
	return res.UTC().Format("2006-01-02T15:04:05Z")
}
// getHtml performs a GET request to url and returns the response body as a
// string together with the HTTP status code.
//
// The body is returned for every status code; callers decide what to do
// with non-200 responses. A failure to perform the request or to read the
// body terminates the program.
func getHtml(url string) (string, int) {
	resp, err := http.Get(url)
	if err != nil {
		log.Fatalln(err)
	}
	// Release the connection once the body has been consumed; otherwise
	// every request of the crawl leaks one.
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatalln(err)
	}
	return string(body), resp.StatusCode
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment