Skip to content

Instantly share code, notes, and snippets.

@glorphindale
Created March 19, 2014 16:56
Show Gist options
  • Save glorphindale/9646119 to your computer and use it in GitHub Desktop.
Save glorphindale/9646119 to your computer and use it in GitHub Desktop.
Преобразования данных
(ns enlive-examples.process
(:require [clojure.string :as string]
[cheshire.core :as chesh]
[clojure.pprint :as pp]))
;; Raw attendee records, parsed once at namespace load time from the JSON
;; file given by a relative path. The `true` argument makes cheshire turn
;; JSON object keys into keywords.
(def data
  (-> "codefest-2013-raw.json"
      slurp
      (chesh/parse-string true)))
;; Sex
(defn person->name
  "Returns the lower-cased first word of a person's full name.

  `person` is a sequential record whose first element is the full name
  (the rest of this file reads the company with `second` and the position
  with `(nth person 2)`).

  The name is trimmed and split on any run of whitespace, so leading spaces,
  tabs or doubled spaces do not produce an empty first name. A missing (nil)
  name yields \"\" instead of throwing a NullPointerException."
  [person]
  (-> (or (first person) "")   ; nil name -> empty string, keeps split safe
      string/trim
      (string/split #"\s+" 2)  ; limit 2: only the first token is needed
      first
      string/lower-case))
;; Set of distinct lower-cased first names found in `data`.
(def all-names
  (into #{} (map person->name data)))
;; REPL check: how many distinct first names there are.
(count all-names)
(defn name->sex
  "Guesses a sex marker for a lower-cased first name.

  Returns a pair [name marker] where marker is \"m\", \"f\" or \"?\".
  Rules are tried in order: an explicit list of names treated as male, then
  the name's last character against the female endings, then against the
  male endings. Anything else (including an empty name, whose last
  character is nil) gets \"?\"."
  [name]
  (let [male-exceptions #{"данила" "тема" "илья" "тёма" "гриша" "никита" "юра" "nikita" "саша" "женя"}
        female-endings  #{\a \а \я}
        male-endings    #{\н \й \р \r \n \л \с \в \п \д \x \w \m \s \y \l \м \т \ь \о \г \б \к \i \d \e \k}
        ending          (last name)
        marker          (cond
                          (contains? male-exceptions name) "m"
                          (contains? female-endings ending) "f"
                          (contains? male-endings ending)   "m"
                          :else                             "?")]
    [name marker]))
(defn person->sex
  "Returns the [first-name marker] pair for a person record by extracting
  the first name with `person->name` and classifying it with `name->sex`."
  [person]
  (name->sex (person->name person)))
;; REPL check: number of distinct names per sex marker.
(->> all-names
     (map name->sex)
     (map second)
     frequencies)
;; Positions
(defn person->position
  "Returns the lower-cased job title of a person record, i.e. its third
  element.

  A nil title, or a record with fewer than three elements, yields \"\".
  The `nth` call passes a not-found value, so short records no longer throw
  IndexOutOfBoundsException, and the element is looked up only once."
  [person]
  (string/lower-case (or (nth person 2 nil) "")))
;; Set of distinct lower-cased job titles found in `data`.
(def all-positions
  (into #{} (map person->position data)))
;; REPL check: how many distinct job titles there are.
(count all-positions)
(defn string-contains?
  "Returns true when string `s` contains at least one of the substrings in
  the collection `vars`, otherwise nil.

  Stops at the first match instead of testing every candidate, and the type
  hints let `.contains` compile to a direct String method call rather than a
  reflective one."
  [^String s vars]
  (some (fn [^String needle] (.contains s needle)) vars))
(defn simplify-position
  "Maps a lower-cased job title to a coarse category.

  Returns a pair [position category]. The rules below are tried top to
  bottom and the first rule with a substring occurring in `position` wins,
  so the order of the table matters (e.g. a title matching both a \"mgmt\"
  and a \"developer\" substring is classified as \"mgmt\"). Titles matching
  no rule get the category \"na\"."
  [position]
  (let [rules [[#{"hr" "персонал" "алексей сухоруков" "людям"} "hr"]
               [#{"дизайнер" "ui" "интерф" "designer" "ux"} "designer"]
               [#{"director" "директор" "manager" "начальник" "pm" "cio"
                  "leader" "менеджер" "руководит" "lead" "лидер" "cto" "boss"
                  "рук." "владелец" "ceo" "главный" "лид" "chief" "пм" "mgr"
                  "управля" "гендир" "vp" "соучре" "управл" "coo" "партнер" "head"} "mgmt"]
               [#{"qa" "тестирован" "качеств" "test" "тестировщик" "поняша"
                  "тестер" "sdet"} "qa"]
               [#{"аналитик" "архи" "architect"} "analysis"]
               [#{"developer" "разработчик" "программист" "програмист"
                  "engineer" "rnd" "инженер" "program"
                  "android" "java" "scala" "javascript" "sde"} "developer"]
               [#{"админ" "admin" "devops"} "admin"]
               [#{"студент"} "student"]]
        ;; First category whose substring set matches, or nil when none does.
        category (some (fn [[needles label]]
                         (when (string-contains? position needles)
                           label))
                       rules)]
    [position (or category "na")]))
;; REPL check: number of distinct titles per category.
(->> all-positions
     (map simplify-position)
     (map second)
     frequencies)
;; REPL check: frequencies of the full [position category] pairs.
(->> all-positions
     (map simplify-position)
     frequencies)
;; REPL check: titles that no rule matched (category "na").
(->> all-positions
     (map simplify-position)
     (filter (fn [[_ category]] (= category "na"))))
(defn person->simple-position
  "Returns the [position category] pair for a person record by extracting
  the title with `person->position` and classifying it with
  `simplify-position`."
  [person]
  (simplify-position (person->position person)))
;; Companies
;; Company field (second element) of every record, duplicates kept so that
;; occurrences can be counted later.
(def all-companies
  (for [person data]
    (second person)))
(defn simplify-company
  "Collapses known alternative spellings of a company name to one canonical
  spelling; any other value (including nil) is returned unchanged.

  NOTE(review): the lookup is an exact, case-sensitive match and nothing in
  this file lower-cases the company field before calling this function,
  while most keys here are lower-case. Confirm against the raw data that
  these keys really occur in that form."
  [company]
  (let [aliases {"2gis" "2гис"
                 "ооо «компания холидей»" "ооо \"компания холидей\""
                 "playtox llc" "playtox"
                 "новео" "noveo"
                 "Кадровое Агентство Алексея Сухорукова" "Alexey Suhorukov's Recruitment Agency"}]
    (get aliases company company)))
(defn bin-company
  "Turns a [company count] entry into a single-entry map
  {company size-label}, where the label names the bucket the count falls
  into: \"1\", \"2-5\", \"6-10\", \"11-15\", \"16-20\", \"21-25\", \"26-50\",
  or \"50+\" for counts above 50. Returns nil for a non-positive count."
  [[company n]]
  (when (pos? n)
    ;; Each test value is an exclusive upper bound: the first bound that
    ;; is greater than n selects the label.
    {company (condp > n
               2  "1"
               6  "2-5"
               11 "6-10"
               16 "11-15"
               21 "16-20"
               26 "21-25"
               51 "26-50"
               "50+")}))
;; Map from canonical company name to its size-bucket label, built by
;; counting occurrences of each canonical name and binning every count.
(def company-freqs
  (let [counts (frequencies (map simplify-company all-companies))]
    (reduce merge (map bin-company counts))))
(defn person->company-size
  "Returns the size-bucket label for the company of a person record: the
  second element is canonicalised with `simplify-company` and looked up in
  `company-freqs`."
  [person]
  (company-freqs (simplify-company (second person))))
;; Bring it all together
(defn transform-person
  "Reduces a person record to the triple
  [sex-marker company-size-label position-category]."
  [person]
  (let [[_ sex]      (person->sex person)
        size         (person->company-size person)
        [_ category] (person->simple-position person)]
    [sex size category]))
;; One [sex company-size category] triple per record in `data`.
(def transformed-data
  (for [person data]
    (transform-person person)))
;; Number of records sharing each distinct triple.
(def freqs
  (frequencies transformed-data))
(defn finalize
  "Converts one entry of a frequencies map, [[sex company position] amount],
  into a string-keyed map with keys \"sex\", \"company\", \"position\" and
  \"amount\"."
  [entry]
  (let [[group amount]         entry
        [sex company position] group]
    {"sex" sex
     "company" company
     "position" position
     "amount" amount}))
;; JavaScript source text: the aggregated rows serialised as a JSON array and
;; wrapped in a `var raw_data =...;` assignment.
(def result
  (let [rows (map finalize freqs)
        json (chesh/generate-string rows)]
    (str "var raw_data =" json ";")))
;; Write the generated text to the output file at namespace load time.
(spit "codefest-2013.json" result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment