Skip to content

Instantly share code, notes, and snippets.

@zackster
Created October 19, 2017 04:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save zackster/b9e0cc87df34218ba9d92454c830945b to your computer and use it in GitHub Desktop.
Save zackster/b9e0cc87df34218ba9d92454c830945b to your computer and use it in GitHub Desktop.
Normalize Code For Cash programming job corpus
import csv
import string
import re
# Lower-case keywords and phrases treated as programming-related.
# The labelling code at the bottom of the file intersects this set with
# whitespace-split tokens when a row is flagged as not-tech.
# NOTE(review): several entries look like misspelled raw tags (e.g.
# "delevoper", "andriod app development", "worldpress"); presumably they
# mirror the source data -- confirm before cleaning them up.
programming_keywords = set([".net", ".net core", "3d", "abap", "active directory", "activecampaign", "admob", "adobe photoshop", "adtech", "adwords", "agile", "ai", "airflow", "aix", "ajax", "akka", "albanian", "alexa", "alexa skills", "alfresco", "algorithms", "amazon web services", "amazon-web-services", "ambari", "analytics", "andriod app development", "android", "androidstudio", "angular", "angular 2", "angular.js", "angular2", "angularjs", "animation", "ansible", "ant", "apache", "apache spark", "apache2", "api", "app", "app development", "app extensions", "apple", "application", "apps", "architect", "architecture", "arduino", "arm", "artificial intelligence", "asp.net", "asp.net mvc", "asp.net web api", "asp.net web forms", "assembly", "augmented reality", "aurelia", "aureliajs", "austin", "automate", "automation", "avada", "avr", "aws", "azure", "babel", "back end", "back-end", "backbone", "backbonejs", "backend", "backend developer", "bash", "beaglebone", "beautifulsoup", "beginner", "berkeleydb", "bi", "big data", "bigdata", "bigmemorygo", "bigquery", "bitcoin", "blockchain", "blocks", "blogging", "blueprintjs", "bluetooth", "boo", "boot", "bootstrap", "bot", "bpm", "braintree", "browser", "bt", "bte", "btle", "business continuity", "business intelligence", "byond", "c", "c#", "c++", "c9", "cakephp", "cannabis", "canvas", "carthage", "cassandra", "cc1310", "centos", "chat bot", "chatbot", "cheap", "chef", "ci", "circuit", "claims adjust", "clojure", "clojurescript", "cloud", "cloud functions", "cloudfoundry", "cms", "coaching", "cobol", "cocoapods", "codeigniter", "codeship", "coffeescript", "cofounder", "commerce", "communicating-sequential-processes", "content-marketing", "continuous integration", "continuum", "contract", "contractual", "copywriting", "coq", "cordova", "core data", "core location", "coredata", "coreos", "crawl", "crawler", "critical-thinking", "cron", "cross platform", "cross-platform", "crypto-currency", "cryptocurrency", 
"cryptography", "csharp", "csp", "css", "css3", "cto", "cucumber", "d3", "d3.js", "d3js", "daemontools", "data", "data analy", "data analysis", "data analyst", "data analytics", "data architecture", "data forensics", "data mining", "data science", "data scientist", "data visual", "data visualization", "data warehouse", "database", "database architecture", "databases", "dataviz", "datomic", "debian", "deep learning", "delevoper", "delphi", "demandware", "deployment", "design", "design agency", "desktop", "dev", "develop", "developer", "development", "devops", "digital marketing", "digital ocean", "digital transformation", "digitalmarketing", "digitalocean", "disaster recovery", "dispensary", "distributed", "divi", "django", "djangorestframework", "djbdns", "docker", "docker swarm", "dot net", "dotnet", "dovecot", "dreamweaver", "drupal", "dustjs", "dw", "dynamodb", "e-commmerce", "ebs", "ec2", "ecmascript", "ecommerce", "ejb", "elastic beanstalk", "elastic search", "elasticity", "elasticsearch", "elearning", "electrical", "electron", "elixir", "elm", "emacs", "embedded", "ember", "ember.js", "emberjs", "end", "engineering", "entity framework", "entry-level", "erlang", "es2015", "es2016", "es2017", "es6", "es7", "es8", "ethereum", "etl", "etl scripts", "eventscalendar", "excel", "experience", "express", "express.js", "expressjs", "extjs", "f#", "facebook", "facebook ads", "fiori", "firebase", "firmware", "flask", "flexbox", "flux", "foundation", "framework", "free lance", "freebsd", "freelance", "front", "front end", "front-end", "front-end developer", "frontend", "frp", "full stack", "full-stack", "fullstack", "functional programming", "game development", "game programming", "game servers", "gamedev", "gamemaker", "ganglia", "garden", "gce", "generation", "generative", "geoserverjavaee", "geotools", "gis", "git", "git hub", "github", "glassfish", "go", "golang", "google analytics", "google maps", "graphic design", "grunt", "gtk", "gulp", "hack", "hadoop", "halogen", 
"hanami", "haproxy", "hardware", "haskell", "haxe", "hbase", "hdfs", "healthcare", "heroku", "hibernate", "high availability", "hive", "hmi", "home warranty", "hstore", "htm", "html", "html/css", "html5", "hybrid", "ibatis", "iconography", "idms", "illustrator", "image processing", "infinispan", "infrastructure-as-code", "innovator", "interactive brokers", "interface", "intern", "internet of things", "ionic", "ionic 2", "ios", "iot", "ipad", "iphone", "iplanet", "ipsec", "ipython", "j2ee", "j2me", "jasper", "java", "java script", "java spring rest hibernate", "java8", "javaee", "javamail", "javascript", "jaxrs", "jboss", "jcl", "jdbc", "jdo", "jekyll", "jenkins", "jmeter", "joomla", "jpa", "jquery", "jquery mobile", "js", "jsp", "junior", "junit", "jupyter notebook", "kafka", "kicad", "kindle", "kiosk", "knockout", "knockoutjs", "kockout", "kockout.js", "kotlin", "kubernetes", "lambda", "lamp", "landing", "laravel", "ldap", "leaflet", "legacy", "less", "leveldb", "liferay", "linq", "linux", "lisp", "listify", "load", "logi", "logo design", "mac", "machine learning", "macos", "magento", "magnolia", "mailchimp", "mailfront", "mailgun", "maintenance", "maker", "mapreduce", "maria", "marketing", "marketing agency", "material", "material design", "materialize", "matlab", "maven", "mean", "mean.js", "messenger bot", "meteor", "meteor.js", "microservice", "microservices", "middleware", "minimal", "minimum viable product", "ml", "mobile", "mobile app", "mobile app development", "mobx", "mock ups", "mock-ups", "mockups", "mongo", "mongodb", "moodle", "mssql", "mvc", "mvc5", "mvp", "mysql", "nagios", "native", "neo4j", "neural networks", "nginx", "nixos", "nlp", "node", "node js", "node.js", "nodejs", "non profit", "non-profit", "nonprofit", "noo", "nosql", "npm", "nsq", "numenta", "numpy", "nunit", "nxp", "objc", "objective c", "objective-c", "objectivec", "oc4j", "odoo", "office", "office.js", "oled", "om-next", "onsen", "onsenui", "openbsd", "openfl", "openlayers", 
"openstack", "oracle", "orientdb", "osx", "otp", "ovh", "packer", "paid search", "pandas", "part time", "part-time", "pascal", "pcb", "performance-marketing", "perl", "pgrounting", "phalcon", "phantomjs", "phoenix", "phonegap", "photoshop", "php", "php7", "pig", "pl/sql", "play", "plugin", "portfolio", "postgis", "postgres", "postgresql", "pouchdb", "power", "powerbi", "powerpc", "ppc", "ppc advertising", "predictive analytics", "process improvement", "project management", "promises", "prototype", "prototyping", "puppet", "purescript", "pyramid", "python", "qa", "qmail", "qnx", "quant", "quantitative finance", "qunit", "r", "r studio", "rabbitmq", "rails", "ramda", "raspberry pi", "ravendb", "rblsmtpd", "rds", "re-frame", "react", "react js", "react mobile", "react native", "react-native", "react.js", "reactivecocoa", "reactjs", "reactnative", "reagent", "realm", "red hat", "redhat", "redis", "redshift", "redux", "reflex-frp", "regex", "relational", "relational database", "remote", "repl-driven-development", "report", "reporting", "responsive", "responsive design", "rest", "rest api", "rest assured", "rest-assured", "restful", "restify", "reverse engineer", "rf", "rhel", "ror", "rspec", "ruby", "ruby on rails", "rudy", "rust", "rwd", "rxjs", "rxswift", "s3", "saas", "sailsjs", "salesforce", "salesforce.com", "saltstack", "sap", "sapui5", "sass", "sbt", "scada", "scala", "scalability", "scenekit", "science", "scikit", "scikit learn", "scikit-learn", "scrape", "scraper", "scraping", "scrapy", "scripting", "scrum", "scss", "search", "security", "selenium", "sencha", "senior", "seo", "sequelize", "servant", "server", "serverless", "servlet", "shell", "shell script", "shopify", "sikulix", "simplexportal", "sinatra", "sitara", "sketch", "slim", "sms", "smtp", "soc", "social entrepreneur", "social framework", "socialmediamarketing", "software", "software development", "spark", "spider", "spring", "spring boot", "sqf", "sql", "sql server 2014", "sqlite", "sqlserver", 
"sqoop", "sqs", "squarespace", "ssas", "ssis", "ssrs", "start up", "startup", "static", "statistics", "stripe", "structural engineering", "struts", "sublime", "svr4", "swift", "swing", "symfony", "sysadmin", "systems", "t-sql", "tableau", "targeted advertising", "taxonomy", "technical lead", "technical support", "teletype", "tensorflow", "terminal", "terraform", "testing", "testng", "texas instruments", "tft", "theme", "themeforest", "theory", "three", "three.js", "threejs", "ti", "titan", "tomcat", "touchscreen", "training", "travis ci", "travisci", "ts", "tsql", "tvos", "twilio", "twisted", "typescript", "typography", "ubuntu", "ui", "ui testing", "ui/ux", "uikit", "unit testing", "unity", "unity3d", "unix", "user", "user experience", "user interface", "uwp", "ux", "vb", "vb.net", "vba", "vertx", "video", "video editing", "video editor", "virtual it", "visual studio", "visual studio 2015", "visualcomposer", "vr", "vue", "vue-js", "vue.js", "vuejs", "vuex", "watchos", "watson", "wcf", "web", "web accessibility", "web app", "web app development", "web application", "web design", "web dev", "web development", "web-design", "web-developer", "webapi", "webapp", "webdesign", "webdevelopment", "webgl", "weblogic", "webpack", "website", "website design", "website development", "website revamp", "websites", "websockets", "websphere", "webstart", "weebly", "wifi", "windows", "windows client", "windows services", "winform", "wire frames", "wireframes", "wireframing", "wix", "woocommerce", "wordpress", "workflow", "worldpress", "wpengine", "wpf", "writing", "xamarin", "xcode", "xhtml", "xml", "xmlrpc", "youtube", "zend", "zookeeper"])
# Matches any single ASCII punctuation character; built once at import time
# instead of on every normalize() call.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))

# NLTK English stop words, populated on first use by _english_stop_words().
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        # Imported lazily, as in the original, so NLTK is only needed when
        # normalize() is actually called.
        from nltk.corpus import stopwords
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def normalize(text):
    """Return *text* reduced to a space-joined string of content words.

    Steps, in order:
      1. delete every ASCII punctuation character (``string.punctuation``);
      2. tokenize with NLTK's ``word_tokenize``;
      3. lower-case each token;
      4. drop tokens that are not purely alphanumeric;
      5. drop NLTK English stop words.

    Punctuation is removed rather than replaced by a space, so "node.js"
    becomes the single token "nodejs" and "c++" becomes "c".

    Requires the third-party ``nltk`` package together with its tokenizer
    and ``stopwords`` data.
    """
    from nltk.tokenize import word_tokenize

    stripped = _PUNCTUATION_RE.sub('', text)
    stop_words = _english_stop_words()

    words = []
    for token in word_tokenize(stripped):
        word = token.lower()
        if word.isalnum() and word not in stop_words:
            words.append(word)
    return ' '.join(words)
def build_label_str(employment_type, remote_ok, time_commitment, not_tech,
                    text, keywords):
    """Return the ``__label__`` prefix for one row, or None to skip the row.

    Args:
        employment_type: raw column value; kept as a label when it is one of
            'fte', 'contract', 'internship' or 'either'.
        remote_ok: raw column value; kept (with '_' turned into '-') when it
            is 'remote_ok' or 'remote_not_ok'.
        time_commitment: raw column value; kept when it is 'fulltime',
            'parttime' or 'project'.
        not_tech: the string '1' marks the row as not a programming job.
        text: already-normalized, space-separated text of the row.
        keywords: set of programming keywords to look for in *text*.

    Returns:
        A space-joined string of ``__label__<name>`` tokens, or None when the
        row should not be emitted.
    """
    if not_tech == '1':
        # A row flagged not-tech that still mentions more than one
        # programming keyword is ambiguous, so it is dropped.
        # The original tested the literal string 'text' instead of the
        # variable, which can never match a keyword and made this check
        # dead code.
        if len(set(text.split()) & keywords) > 1:
            return None
        return '__label__not-programming'

    labels = []
    if remote_ok in ('remote_ok', 'remote_not_ok'):
        labels.append(remote_ok.replace('_', '-'))
    if time_commitment in ('fulltime', 'parttime', 'project'):
        labels.append(time_commitment)
    if employment_type in ('fte', 'contract', 'internship', 'either'):
        labels.append(employment_type)

    if not labels:
        # Nothing recognizable to train on.
        return None
    return ' '.join("__label__{}".format(label) for label in labels)


def main():
    """Read gom_table.csv (tab-separated) and print one labelled line per row.

    Rows with fewer than 14 columns are skipped. Columns 7 and 8 are joined
    and normalized to form the text; columns 10-13 supply the employment
    type, remote flag, time commitment and not-tech flag.
    """
    import sys

    # 'rU' keeps the original universal-newline behaviour on Python 2; the
    # 'U' flag was removed in Python 3.11, where plain 'r' already
    # translates newlines.
    mode = 'rU' if sys.version_info[0] < 3 else 'r'
    with open("gom_table.csv", mode) as tsv:
        for line in csv.reader(tsv, delimiter="\t"):
            if len(line) < 14:
                continue
            text = normalize(line[7] + ' ' + line[8])
            label_str = build_label_str(
                line[10], line[11], line[12], line[13],
                text, programming_keywords,
            )
            if label_str is None:
                continue
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("{} {}".format(label_str, text))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment