Skip to content

Instantly share code, notes, and snippets.

@ipkpjersi
Last active August 12, 2023 00:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ipkpjersi/3ce754391ab8390f23d9b9f80fca3994 to your computer and use it in GitHub Desktop.
Save ipkpjersi/3ce754391ab8390f23d9b9f80fca3994 to your computer and use it in GitHub Desktop.
Lemmy User Comment Scraper
<?php
/**
 * Fetch the body of a URL with cURL and return it as a string.
 *
 * Redirects are followed (at most 10) and a desktop-browser User-Agent is sent.
 * Only the http and https protocols are permitted, both for the initial request
 * and for redirect targets, so a caller-supplied URL cannot be used to read
 * local files (file://) or to reach other services (gopher://, dict://, ...).
 * While $production is false, TLS peer and host verification are switched off.
 *
 * On any transport failure the script terminates with a "Curl error: ..." message.
 * HTTP error statuses are not treated as failures; their body is returned as-is.
 *
 * @param string $url Absolute URL to request.
 * @return string Raw response body.
 */
function fetchContent($url) {
    $production = false; //Change as needed

    $ch = curl_init($url);
    if ($ch === false) {
        die('Curl error: unable to initialise a cURL handle');
    }

    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    if (!$production) {
        //Development only: accept self-signed or mismatched certificates
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 0);
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, 0);
    }
    //The URL may originate from request input, so refuse every non-HTTP protocol,
    //including on redirects
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    //Without timeouts an unresponsive server would block this request indefinitely
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');

    $content = curl_exec($ch);
    if ($content === false || curl_errno($ch)) {
        //Read the message before releasing the handle
        $error = curl_error($ch);
        curl_close($ch);
        die('Curl error: ' . $error);
    }

    curl_close($ch);
    return $content;
}
//Entry point: reads "instance" and "username" from POST (preferred) or GET, downloads every
//page of that user's comments from the instance's /api/v3/user/ endpoint and returns them
//merged into a single JSON document offered as a file download.

//Emit a JSON error document with the given HTTP status and stop.
//A closure is used so that no extra global function name is introduced.
$fail = function ($message, $status) {
    http_response_code($status);
    header('Content-Type: application/json');
    die(json_encode(['error' => $message]));
};

$instance = isset($_POST['instance']) ? $_POST['instance'] : (isset($_GET['instance']) ? $_GET['instance'] : null);
$username = isset($_POST['username']) ? $_POST['username'] : (isset($_GET['username']) ? $_GET['username'] : null);
//Array-valued parameters (e.g. instance[]=x) are truthy but must not reach the string functions below
if (!is_string($instance) || !is_string($username) || !$instance || !$username) {
    $fail('Both "instance" and "username" parameters are required.', 400);
}

//Sanitize the user input
$instance = filter_var($instance, FILTER_SANITIZE_URL);
$username = preg_replace("/[^a-zA-Z0-9_\-]/", '', $username);
if (!$instance || !$username) {
    $fail('Invalid "instance" or "username" provided.', 400);
}

//Only http(s) instances may be contacted; anything else (file://, gopher://, ...) is refused
if (preg_match('#^([a-z][a-z0-9+.\-]*)://#i', $instance, $scheme_match)) {
    if (!in_array(strtolower($scheme_match[1]), ['http', 'https'], true)) {
        $fail('Invalid "instance" or "username" provided.', 400);
    }
    $base_url = $instance;
} else {
    //libcurl guesses the protocol (normally HTTP) for scheme-less URLs; make that explicit
    $base_url = 'http://' . $instance;
}
//Avoid a double slash when the instance was given with a trailing "/"
$base_url = rtrim($base_url, '/');
//NOTE(review): internal/private hosts are still reachable through this parameter;
//add an allow-list if this script is ever exposed publicly.

$per_page = 50;
$url = "$base_url/api/v3/user/?username=$username&limit=$per_page";

//Initial fetch to get the comment_count
$content = fetchContent($url);
$data = json_decode($content, true);
if (
    !is_array($data)
    || !isset($data['person_view']['counts']['comment_count'])
    || !is_numeric($data['person_view']['counts']['comment_count'])
) {
    $fail('Unexpected response from the instance: the comment count is missing.', 502);
}

$total_pages = (int) ceil($data['person_view']['counts']['comment_count'] / $per_page);
$date = date("m-d-Y_H-i-s");

//Start from the initial response so that a user without comments still yields a complete document
$page_data = $data;
$all_comments = [];

//Fetch all comments
for ($i = 1; $i <= $total_pages; $i++) {
    $page_content = fetchContent("$url&page=$i");
    $decoded = json_decode($page_content, true);
    if (!is_array($decoded) || !isset($decoded['comments']) || !is_array($decoded['comments'])) {
        $fail("Unexpected response from the instance for page $i.", 502);
    }
    //Keep the most recent page as the envelope for the merged result
    $page_data = $decoded;
    $all_comments = array_merge($all_comments, $decoded['comments']);
}

//Replace the 'comments' section in the last page_data with our merged comments
$page_data['comments'] = $all_comments;

//Append additional data
$page_data['website'] = $instance;
$page_data['username'] = $username;
$page_data['date'] = $date;

//Encode before any download header is sent so that a failure can still be reported as an error
$json = json_encode($page_data);
if ($json === false) {
    $fail('Failed to encode the collected comments as JSON.', 500);
}

//Build the download name from the host: drop the scheme and an optional "www." prefix, then
//replace everything that is unsafe inside a Content-Disposition filename
$host_label = preg_replace('#^https?://(www\.)?#i', '', rtrim($instance, '/'));
$host_label = preg_replace('/[^A-Za-z0-9._\-]/', '_', $host_label);
$filename = $host_label . "-$username-" . $date . ".json";

header('Content-Disposition: attachment; filename="' . $filename . '"');
header('Content-Type: application/json');
echo $json;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment