Skip to content

Instantly share code, notes, and snippets.

@standinga
Last active August 29, 2015 01:38
Show Gist options
  • Save standinga/03fcd1409cc44e4cb8b2 to your computer and use it in GitHub Desktop.
Save standinga/03fcd1409cc44e4cb8b2 to your computer and use it in GitHub Desktop.
SC followers downloader
(ns sc.core
(:require [clj-http.client :as client]
[cheshire.core :refer :all]
[clojure.string]
[clojure.java.io :as io]))
;; !!!!!
;; FIXED version runs without problems
;; To run these functions you need to enter your SoundCloud client id below:
(def soundcloud_client_id "CLIENT ID")
(def method "followers")
;; Below are the paths used to store the downloaded follower files.
;; !!! Copy the file with followers into `path`.
(def path "/Volumes/ssd/SC/") ; set this path to store files with followers, and copy the followers file 140517.edn there
;; Copy the file already_used.edn into this path.
(def path1 "/Users/michal/sc/resources/")
;; logfile stores the ids of followers whose own followers have already been downloaded
(def logfile (str path1 "SClog"))
(def off 100) ; pagination offset step (also used to name the per-page partial files)
(def error_404_list (str path1 "error404list")) ; file collecting the ids of users for which the API answered 404
;; Number of failed downloads. Starts at 0 so that the final "errors" report
;; equals the number of failures (it used to start at 1 and over-report by one).
(def error_counter (atom 0))
;; How many followers are still left to download; set by the chunked downloader.
(def howmanyleft (atom 0))
;; Ordinal of the download being reported; it is printed before being
;; incremented, which is why it starts at 1.
(def counter (atom 1))
(defn httpCall
  "Performs a GET request on `url` through clj-http, with both the socket
  and the connection timeout set to 2000."
  [url]
  (let [timeouts {:conn-timeout 2000
                  :socket-timeout 2000}]
    (client/get url timeouts)))
;; Helper checking whether a 404 error occurred.
(defn got404?
  "Requests one page of `method` for `userid` at `offset` and returns true
  when the request throws an exception whose message is exactly
  `clj-http: status 404`; returns false otherwise."
  [userid method offset]
  (let [outcome (try
                  (client/get (str "http://api.soundcloud.com/users/" userid "/" method
                                   "?client_id=" soundcloud_client_id "&limit=100"
                                   "&linked_partitioning=1&offset=" offset)
                              {:socket-timeout 2000 :conn-timeout 2000})
                  ;; on failure the exception message becomes the outcome
                  (catch Exception ex
                    (str (.getMessage ex))))]
    (= outcome "clj-http: status 404")))
;; Fetches one page and distinguishes 404 errors from transient failures.
(defn SCcall100
  "Fetches one page (limit 100, linked partitioning on) of `method` for
  `userid` starting at `offset`.

  Returns the response when its status is 200. Otherwise, when `got404?`
  reports a 404, prints a message, appends the user id to `error_404_list`
  and returns nil. Otherwise repeats the request up to 9 more times and
  returns the first response with status 200, or nil when none succeeds."
  [userid method offset]
  (let [fetch (fn []
                ;; any exception is swallowed and turned into nil
                (try
                  (httpCall (str "http://api.soundcloud.com/users/" userid "/" method
                                 "?client_id=" soundcloud_client_id "&limit=100"
                                 "&linked_partitioning=1&offset=" offset))
                  (catch Exception _ nil)))
        ok? (fn [response] (= (:status response) 200))
        first-try (fetch)]
    (cond
      (ok? first-try)
      first-try

      (got404? userid method offset)
      (do (println (str "user doesn't exist!!!" userid))
          (spit error_404_list (str userid "\n") :append true))

      :else
      ;; `some` stops at the first successful attempt; attempts are numbered 1..9
      (some (fn [attempt]
              (let [response (fetch)]
                (println (str userid "_" offset " will retry, retry: " attempt))
                (when (ok? response)
                  response)))
            (range 1 10)))))
(defn user_map
  "Builds a keyword-keyed map out of `user`, which is invoked as a function
  of string keys (e.g. a map with string keys). Only the nine fields listed
  below are kept; a field that `user` does not provide maps to nil."
  [user]
  (let [fields [:id :followers_count :country :full_name :track_count
                :plan :followings_count :last_modified :description]]
    ;; each keyword is looked up under its own name as a string key
    (into {} (for [field fields]
               [field (user (name field))]))))
(defn delete_partial_files
  "Deletes the partial page files of `userid` (named `<path><userid>_<offset>`),
  starting at offset 0 and stepping by `off`. Stops at the first file that
  cannot be deleted and returns the string `deleted`."
  [userid]
  (loop [offset 0]
    (let [removed? (try
                     (io/delete-file (str path userid "_" offset))
                     ;; a failed delete (e.g. missing file) ends the walk
                     (catch Exception _ nil))]
      (if-not removed?
        "deleted"
        (recur (+ offset off))))))
(defn get_followers
  "Downloads one page of followers of `userid` starting at `offset` and
  stores the parsed collection in the partial file `<path><userid>_<offset>`.

  Returns:
  - the string `error` when the HTTP call failed (SCcall100 returned nil) or
    the response carries no `collection` entry; the error counter is
    incremented and a message is printed,
  - the string `no_more_followers` when the collection is empty,
  - the result of `spit` (nil) after a page was written."
  [userid offset]
  (let [fail (fn []
               (swap! error_counter inc)
               (println (str "\n !!!!! " userid " ERROR !!!!"))
               "error")
        call_rough (SCcall100 userid "followers" offset)]
    (if (nil? call_rough) ; this means there was an http error or a timeout
      (fail)
      (let [parsed (get (parse-string (:body call_rough)) "collection")]
        (cond
          ;; No collection at all: previously this wrote `nil` to the partial
          ;; file and paging went on; report it as an error instead.
          (nil? parsed) (fail)
          ;; End of collection. Checked on the parsed data rather than on the
          ;; exact body text, so extra keys or whitespace in the response
          ;; cannot hide the end of the listing.
          (empty? parsed) "no_more_followers"
          :else (spit (str path userid "_" offset)
                      (with-out-str (pr parsed))))))))
(defn join_followers
  "Concatenates the successfully downloaded partial files of `userid` into one
  vector. Files `<path><userid>_<offset>` are read for offset 0, `off`,
  2*`off`, ... until the first one that cannot be read."
  [userid]
  (loop [offset 0
         joined []]
    ;; NOTE(review): the partial files are parsed with clojure.core/read-string;
    ;; they are expected to hold only data written by this program.
    (if-let [text (try
                    (slurp (str path userid "_" offset))
                    (catch Exception _ nil))]
      (recur (+ offset off) (into joined (read-string text)))
      joined)))
(defn get_all_users_followers
  "Downloads every page of followers of `userid`, page by page.

  On an error the partial files are deleted. When the last page has been
  reached the pages are joined into `<path><userid>.edn`, the user id is
  appended to the log file, progress is printed, the progress counters are
  updated and the partial files are deleted."
  [userid]
  (loop [offset 0]
    (let [outcome (get_followers userid offset)]
      (cond
        ;; error encountered: throw away whatever was downloaded so far
        (= outcome "error")
        (delete_partial_files userid)

        ;; no more followers: time to join the pages and log the user
        (= outcome "no_more_followers")
        (do
          (spit (str path userid ".edn") (pr-str (join_followers userid)))
          (spit logfile (str userid "\n") :append true)
          (println "downloaded " (str @counter) " left " (str @howmanyleft))
          (swap! howmanyleft dec)
          (swap! counter inc)
          (delete_partial_files userid))

        ;; a page was stored: move on to the next one
        :else
        (recur (+ offset off))))))
;; Sequential method: downloads the followers of every follower of a user, one at a time.
(defn get_all_followers_of_users_followers
  "Reads the followers of `userid` from `<path><userid>.edn`, leaves out the
  ids found in the log file (already downloaded) and in the 404 list (users
  that no longer exist), prints a summary and then downloads the followers of
  each remaining id sequentially. Returns the realized sequence of results."
  [userid]
  (let [ids-in (fn [file] (set (re-seq #"\w+" (slurp file))))
        skipped (into (ids-in logfile) (ids-in error_404_list))
        followers_list (mapv #(:id (user_map %))
                             (read-string (slurp (str path userid ".edn"))))
        ;; ids in the files are text, so each follower id is compared as a string
        pending (remove #(contains? skipped (str %)) followers_list)]
    (println "sequential how many followers: " (count followers_list)
             "difference: " (- (count followers_list) (count pending))
             "sequential how many to download: " (count pending))
    (doall (map get_all_users_followers pending))))
;; Chunked parallel method: works faster than the simple parallel method.
;; Chunks are built with partition-all, so the last, shorter chunk is
;; downloaded as well (partition used to drop it, leaving followers behind).
(defn get_all_followers_of_users_followers_chunked
  "Downloads, in parallel chunks, the followers of every follower of `userid`.

  Does nothing but print a message when `userid` is already listed in
  `already_used.edn`. Otherwise reads the followers from
  `<path><userid>.edn`, leaves out the ids found in the log file and in the
  404 list, stores the number of remaining ids in `howmanyleft` and prints a
  summary. When nothing is left, `userid` is added to `already_used.edn`;
  otherwise the remaining ids are split into chunks whose size depends on
  how many there are, and the chunks are processed with nested `pmap`."
  [userid]
  (let [exhausted_file (str path1 "already_used.edn")
        already_exhausted (set (read-string (slurp exhausted_file)))]
    (if (contains? already_exhausted userid)
      (println "\n \n Already done \n")
      (let [already_downloaded (set (re-seq #"\w+" (slurp logfile)))
            users_dont_exist (set (re-seq #"\w+" (slurp error_404_list)))
            users_to_filter_out (into already_downloaded users_dont_exist)
            followers_list (mapv (fn [x] (:id (user_map x)))
                                 (read-string (slurp (str path userid ".edn"))))
            followers_not_yet_downloaded (remove #(contains? users_to_filter_out (str %))
                                                 followers_list)
            number_of_followers_to_download (count followers_not_yet_downloaded)
            ;; various chunk sizes depending on how many followers to download
            partition_size (cond
                             (> number_of_followers_to_download 2700) 110
                             (> number_of_followers_to_download 1800) 80
                             (> number_of_followers_to_download 1000) 64
                             (> number_of_followers_to_download 700) 40
                             (> number_of_followers_to_download 320) 20
                             (> number_of_followers_to_download 160) 12
                             (> number_of_followers_to_download 80) 8
                             (> number_of_followers_to_download 30) 6
                             (> number_of_followers_to_download 20) 5
                             (> number_of_followers_to_download 10) 4
                             (> number_of_followers_to_download 3) 2
                             :else 1)]
        ;; update the existing atom instead of re-defining the var from inside a function
        (reset! howmanyleft number_of_followers_to_download)
        (println "how many followers: " (count followers_list)
                 "difference: " (- (count followers_list) number_of_followers_to_download)
                 "how many to download: " number_of_followers_to_download)
        (if (zero? number_of_followers_to_download)
          (spit exhausted_file (with-out-str (pr (conj already_exhausted userid))))
          ;; partition-all keeps the trailing chunk that is shorter than partition_size
          (doall (pmap (fn [chunk] (doall (pmap get_all_users_followers chunk)))
                       (partition-all partition_size followers_not_yet_downloaded))))))))
;; Combines the parallel and the sequential method to download followers.
(defn id_sucker
  "Downloads the followers of the followers of `id`, unless `id` is already
  listed in `already_used.edn` (then only a message is printed).

  Runs the chunked parallel download 15 times, timing each run, then the
  sequential download once for whatever is left, and finally prints the
  error counter together with the id."
  [id]
  (let [used (set (read-string (slurp (str path1 "already_used.edn"))))]
    (if-not (contains? used id)
      (do
        ;; 15 passes of the parallel method
        (dotimes [_ 15]
          (time (get_all_followers_of_users_followers_chunked id)))
        ;; one pass of the sequential method for the rest
        (get_all_followers_of_users_followers id)
        (println "errors " (str @error_counter) "user id" (str id)))
      ;; already downloaded users are not downloaded again
      (println "\n \n Already done \n"))))
(defn -main
  "Prompts for one or more user ids on standard input, reads every word of
  the entered line with `read-string` and runs `id_sucker` on each id,
  timing the whole run. Returns the realized sequence of results."
  []
  (println "enter user id or ids: ")
  ;; read-line yields nil at end of input and re-seq throws on nil, so a
  ;; missing line is treated as an empty one (no ids).
  (let [line (or (read-line) "")
        ids (map read-string (re-seq #"\w+" line))]
    (time (doall (map id_sucker ids)))))
(-main) ; runs -main when the file is loaded; when calling from a terminal enter 140517, later you can call it with multiple user ids (they need to be already downloaded)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment