;; Last active August 29, 2015 01:38
;; SC followers downloader
;; Namespace for the SoundCloud followers downloader.
;; clj-http performs the HTTP calls, cheshire parses the JSON bodies and
;; clojure.java.io is used (as `io`) to delete partially downloaded files.
(ns sc.core
  (:require [clj-http.client :as client]
            [cheshire.core :refer :all]
            [clojure.java.io :as io]))
;; !!!!!
;; FIXED version runs without problems.
;; To run these functions, enter your SoundCloud client id below:
;; SoundCloud API client id; sent as the client_id query parameter of every request.
(def soundcloud_client_id "CLIENT ID")
;; API sub-resource name. NOTE(review): the functions visible in this file take their own
;; `method` argument or pass "followers" directly -- confirm this var is still needed.
(def method "followers")
;; Directories used for the downloaded follower files and for the bookkeeping files.
;; !!! Copy the seed follower file (e.g. 140517.edn) into `path` before running.
(def path "/Volumes/ssd/SC/") ; follower files are written here as <userid>_<offset> parts and <userid>.edn results
;; Copy the file already_used.edn into `path1`.
(def path1 "/Users/michal/sc/resources/")
;; logfile stores the ids of users whose followers have been downloaded successfully
(def logfile (str path1 "SClog"))
(def off 100) ; pagination step: added to the offset between consecutive page requests
(def error_404_list (str path1 "error404list")) ; file collecting the ids of users reported as non-existent (404)
(def error_counter (atom 1)) ; incremented on every failed download; NOTE(review): starts at 1, so the printed total is one higher than the number of failures -- confirm this is intended
(def howmanyleft (atom 0)) ; how many users are still left to download; starts at zero
(def counter (atom 1)) ;; progress counter printed as "downloaded N"; starts at 1
(defn httpCall
  "Performs an HTTP GET on `url` through clj-http and returns its response.
  Both the socket timeout and the connection timeout are set to 2000."
  [url]
  (let [timeouts {:socket-timeout 2000
                  :conn-timeout 2000}]
    (client/get url timeouts)))
;; Helper: checks whether the API answers this request with HTTP 404.
(defn got404?
  "Requests one page of `method` (e.g. \"followers\") for `userid` at `offset`
  and returns true only when the server answers with status 404.

  Any other outcome -- a different status, a timeout or a connection error --
  returns false.

  NOTE(review): the host part of the URL was an empty string in the source,
  which makes every request malformed; it has been restored to the SoundCloud
  users endpoint. Confirm the scheme/host against the API documentation."
  [userid method offset]
  (let [url (str "https://api.soundcloud.com/users/" userid "/" method
                 "?client_id=" soundcloud_client_id
                 "&limit=100" "&linked_partitioning=1&offset=" offset)]
    (try
      ;; :throw-exceptions false makes clj-http return the 404 response instead
      ;; of throwing, so the status is inspected directly rather than by
      ;; matching the text of an exception message.
      (= 404 (:status (client/get url {:socket-timeout 2000
                                       :conn-timeout 2000
                                       :throw-exceptions false})))
      (catch Exception _
        false))))
;; Fetches one page of results and tells "user is gone" (404) apart from transient failures.
(defn SCcall100
  "HTTP call for the chosen `method` of `userid`, using the configured client id,
  linked partitioning and a page limit of 100, starting at `offset`.

  Returns the clj-http response map when a request succeeds with status 200.
  Returns nil when
    * the API reports 404 for the user (the id is then appended to
      `error_404_list`), or
    * the first attempt and all 9 retries fail.

  NOTE(review): this form was truncated in the source (unbalanced parentheses,
  missing success/else branches) and the URL host was an empty string. The
  branches were reconstructed from the surrounding code (callers treat nil as
  an error) and the host was restored to the SoundCloud users endpoint --
  confirm both."
  [userid method offset]
  (let [url (str "https://api.soundcloud.com/users/" userid "/" method
                 "?client_id=" soundcloud_client_id
                 "&limit=100" "&linked_partitioning=1&offset=" offset)
        ;; A failed request (timeout, non-2xx status, ...) is represented as nil.
        attempt (fn [] (try (httpCall url) (catch Exception _ nil)))
        first-response (attempt)]
    (cond
      (= 200 (:status first-response))
      first-response

      ;; The user no longer exists: remember the id so it is skipped next time.
      (got404? userid method offset)
      (do (println (str "user doesn't exist!!!" userid))
          (spit error_404_list (str userid "\n") :append true)
          nil)

      ;; Transient failure: retry, giving up (nil) once retry reaches 10.
      :else
      (loop [retry 1]
        (when (< retry 10)
          (println (str userid "_" offset " will retry, retry: " retry))
          (let [retry-response (attempt)]
            (if (= 200 (:status retry-response))
              retry-response
              (recur (inc retry)))))))))
(defn user_map
  "Converts a parsed user record (a map with string keys, as produced by the
  JSON parser) into a map with keyword keys, keeping only the listed fields.
  Fields absent from `user` -- or a nil `user` -- yield nil values.

  NOTE(review): the map literal was left unclosed in the source; it is closed
  here after :description. If further fields followed in the original, add
  them back."
  [user]
  {:id               (get user "id")
   :followers_count  (get user "followers_count")
   :country          (get user "country")
   :full_name        (get user "full_name")
   :track_count      (get user "track_count")
   :plan             (get user "plan")
   :followings_count (get user "followings_count")
   :last_modified    (get user "last_modified")
   :description      (get user "description")})
(defn delete_partial_files
  "Deletes the partially downloaded page files of `userid`.

  Files are named <path><userid>_<offset>; starting at offset 0 and stepping by
  `off`, each file is deleted in turn. The loop stops at the first file that
  cannot be deleted (typically because it does not exist). Returns nil."
  [userid]
  (loop [offset 0]
    (let [filename (str path userid "_" offset)
          ;; io/delete-file throws when the file cannot be deleted; treat that
          ;; as the signal that there are no more partial files.
          deleted? (try (io/delete-file filename)
                        (catch Exception _ nil))]
      (when deleted?
        (recur (+ offset off))))))
(defn get_followers
  "Downloads one page of followers of `userid` starting at `offset`.

  Returns
    \"error\"             when the HTTP call failed (SCcall100 returned nil);
                          the error counter is incremented,
    \"no_more_followers\" when the returned collection is empty, i.e. the end
                          of the follower list has been reached,
    nil                   after the page has been written to the file
                          <path><userid>_<offset>.

  NOTE(review): the source had lost the closing bracket of the bindings and
  the \"no_more_followers\" result that the caller checks for; both are
  restored here."
  [userid offset]
  (let [call_rough (SCcall100 userid "followers" offset)]
    (if (nil? call_rough) ; this means there was an http error or a timeout
      (do (swap! error_counter inc)
          (println (str "\n !!!!! " userid " ERROR !!!!"))
          "error")
      (let [call (:body call_rough)
            parsed (get (parse-string call) "collection")
            filename (str path userid "_" offset)]
        ;; Test the parsed collection rather than the exact body text, so that
        ;; an empty page carrying extra keys or whitespace still ends the
        ;; download instead of looping forever.
        (if (empty? parsed)
          "no_more_followers"
          (spit filename (pr-str parsed)))))))
(defn join_followers
  "Concatenates the successfully downloaded page files of `userid`.

  Reads <path><userid>_0, <path><userid>_<off>, ... in order, appending the
  contents of each file to one vector, and returns that vector as soon as a
  file cannot be read."
  [userid]
  (loop [position 0
         collected []]
    (let [part-file (str path userid "_" position)
          contents (try
                     (slurp part-file)
                     (catch Exception _ nil))]
      (if (nil? contents)
        collected
        (recur (+ position off)
               (into collected (read-string contents)))))))
(defn get_all_users_followers
  "Downloads every page of followers of `userid`, page by page.

  * On an error the partial page files are deleted and the download of this
    user is abandoned.
  * When the end of the list is reached the pages are joined into
    <path><userid>.edn, the id is appended to `logfile`, progress is printed,
    the counters are updated and the partial page files are deleted.
  * Otherwise the next page (offset + `off`) is requested.

  NOTE(review): in the source the progress `println` was never closed, which
  swallowed the following statements and left the form unbalanced; it is
  closed here directly after its arguments."
  [userid]
  (loop [offset 0]
    (let [followers (get_followers userid offset)]
      (cond
        ;; encountered an error: delete the partial files
        (= followers "error")
        (delete_partial_files userid)

        ;; no more followers: time to join them and log them
        (= followers "no_more_followers")
        (do (spit (str path userid ".edn") (pr-str (join_followers userid)))
            ;; add the successfully downloaded user to the logfile
            (spit logfile (str userid "\n") :append true)
            (println "downloaded " (str @counter) " left " (str @howmanyleft))
            (swap! howmanyleft dec)
            (swap! counter inc)
            (delete_partial_files userid))

        :else
        (recur (+ offset off))))))
;; Sequential method: downloads the followers of each follower of a user, one after another.
(defn get_all_followers_of_users_followers
  "Reads the follower list of `userid` from <path><userid>.edn, skips every
  follower whose id is already present in `logfile` or in `error_404_list`,
  prints a short summary and downloads the remaining followers one by one.
  Returns the realized sequence of the individual download results."
  [userid]
  (let [words-of (fn [file] (set (re-seq #"\w+" (slurp file))))
        finished-ids (words-of logfile)          ; already downloaded
        missing-ids (words-of error_404_list)    ; users that no longer exist
        skip-ids (into finished-ids missing-ids)
        all-ids (mapv #(:id (user_map %))
                      (read-string (slurp (str path userid ".edn"))))
        ;; ids in the files are text, ids in the follower list are not
        pending-ids (remove #(contains? skip-ids (str %)) all-ids)]
    (println "sequential how many followers: " (count all-ids)
             "difference: " (- (count all-ids) (count pending-ids))
             "sequential how many to download: " (count pending-ids))
    (doall (map get_all_users_followers pending-ids))))
;; Chunked parallel method: splits the pending followers into chunks and downloads the chunks in parallel.
(defn get_all_followers_of_users_followers_chunked
  "Parallel variant of the follower-of-followers download for `userid`.

  Does nothing (apart from printing a note) when `userid` is listed in
  already_used.edn. Otherwise the followers that are neither in `logfile` nor
  in `error_404_list` are split into chunks whose size grows with the amount
  of work, and the chunks are downloaded with nested `pmap`s. When nothing is
  left to download, `userid` is added to already_used.edn.

  Fixes relative to the source:
    * `partition` dropped the last, incomplete chunk, so some followers were
      never scheduled; `partition-all` keeps it.
    * the remaining-work counter is updated with `reset!` instead of
      re-`def`-ining the var from inside the function."
  [userid]
  (let [already_exhausted (set (read-string (slurp (str path1 "already_used.edn"))))]
    (if (contains? already_exhausted userid)
      (println "\n \n Already done \n")
      (let [already_downloaded (set (re-seq #"\w+" (slurp logfile)))
            users_dont_exist (set (re-seq #"\w+" (slurp error_404_list)))
            users_to_filter_out (into already_downloaded users_dont_exist)
            followers_list (mapv (fn [x] (:id (user_map x)))
                                 (read-string (slurp (str path userid ".edn"))))
            followers_not_yet_downloaded (remove #(contains? users_to_filter_out (str %))
                                                 followers_list)
            number_of_followers_to_download (count followers_not_yet_downloaded)
            ;; various chunk sizes depending on how many followers to download
            partition_size (cond
                             (> number_of_followers_to_download 2700) 110
                             (> number_of_followers_to_download 1800) 80
                             (> number_of_followers_to_download 1000) 64
                             (> number_of_followers_to_download 700) 40
                             (> number_of_followers_to_download 320) 20
                             (> number_of_followers_to_download 160) 12
                             (> number_of_followers_to_download 80) 8
                             (> number_of_followers_to_download 30) 6
                             (> number_of_followers_to_download 20) 5
                             (> number_of_followers_to_download 10) 4
                             (> number_of_followers_to_download 3) 2
                             :else 1)]
        (reset! howmanyleft number_of_followers_to_download)
        (println "how many followers: " (count followers_list)
                 "difference: " (- (count followers_list) number_of_followers_to_download)
                 "how many to download: " number_of_followers_to_download)
        (if (zero? number_of_followers_to_download)
          ;; everything is downloaded: mark this user as exhausted
          (spit (str path1 "already_used.edn")
                (pr-str (conj already_exhausted userid)))
          (doall (pmap (fn [chunk] (doall (pmap get_all_users_followers chunk)))
                       (partition-all partition_size followers_not_yet_downloaded))))))))
;; Uses both the parallel and the sequential method to download the followers of a user's followers.
(defn id_sucker
  "Downloads the followers of every follower of `id`.

  Skips ids already listed in already_used.edn. Otherwise runs the chunked
  parallel download 15 times (timing each run), then runs the sequential
  download once to pick up whatever is still missing, and finally prints the
  error counter together with the id.

  NOTE(review): the source had lost the `let` that opens the binding vector
  and the `do` that groups the steps of the else branch; both are restored."
  [id]
  ;; don't download users that are already completely downloaded
  (let [already_exhausted (set (read-string (slurp (str path1 "already_used.edn"))))]
    (if (contains? already_exhausted id)
      (println "\n \n Already done \n")
      (do
        ;; try the parallel download method 15 times
        (dotimes [_ 15]
          (time (get_all_followers_of_users_followers_chunked id)))
        ;; try once to download the rest of the followers sequentially
        (get_all_followers_of_users_followers id)
        (println "errors " (str @error_counter) "user id" (str id))))))
(defn -main
  "Prompts for one or more user ids on standard input, reads each
  whitespace/punctuation separated token with `read-string` and runs
  `id_sucker` on every id, timing the whole run."
  []
  (println "enter user id or ids: ")
  (let [tokens (re-seq #"\w+" (read-line))
        ids (map read-string tokens)]
    (time (doall (map id_sucker ids)))))

;; Entry point when run from the terminal: enter 140517 first; later you can
;; enter multiple user ids (their follower files need to be downloaded already).
(-main)
