Created
January 16, 2016 18:14
-
-
Save sorami/09a5940dfdb8458c1ce9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Finding \"X-Tech\" Candidates" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Get the names of industries\n", | |
| "\n", | |
| "Fetch the English labels of the categories and pages listed under [Category:Industries - Wikipedia, the free encyclopedia](https://en.wikipedia.org/wiki/Category:Industries), via the DBpedia SPARQL endpoint." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "%%bash\n", | |
| "pip install SPARQLWrapper" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from SPARQLWrapper import SPARQLWrapper, JSON" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "sparql = SPARQLWrapper(\"http://dbpedia.org/sparql\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "sparql.setQuery(\"\"\"\n", | |
| " PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>\n", | |
| " PREFIX skos: <http://www.w3.org/2004/02/skos/core#>\n", | |
| " PREFIX dcterms: <http://purl.org/dc/terms/>\n", | |
| "\n", | |
| " SELECT DISTINCT ?label WHERE {\n", | |
| " {\n", | |
| " {?s dcterms:subject <http://dbpedia.org/resource/Category:Industries>} \n", | |
| " UNION {?s skos:broader <http://dbpedia.org/resource/Category:Industries>}\n", | |
| " } .\n", | |
| " ?s rdfs:label ?label .\n", | |
| " filter langMatches(lang(?label),\"en\")\n", | |
| " }\n", | |
| "\"\"\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "sparql.setReturnFormat(JSON)\n", | |
| "results = sparql.query().convert()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "list_industries = [result[\"label\"][\"value\"] for result in results[\"results\"][\"bindings\"]]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[u'Chemical industry',\n", | |
| " u'Show business',\n", | |
| " u'Companies by arms sales',\n", | |
| " u'Motorsport industry',\n", | |
| " u'Poverty industry',\n", | |
| " u'Leisure industry',\n", | |
| " u'Celebrity\\u2013industrial complex',\n", | |
| " u'Sport industry',\n", | |
| " u'Aircraft industry',\n", | |
| " u'Arms industry',\n", | |
| " u'Professional sports',\n", | |
| " u'Semiconductor industry',\n", | |
| " u'Video game industry',\n", | |
| " u'Electric power industry',\n", | |
| " u'Payment card industry',\n", | |
| " u'Personal care',\n", | |
| " u'Bicycle industry',\n", | |
| " u'Creative industries',\n", | |
| " u'Textile industry',\n", | |
| " u'Professional audiovisual industry',\n", | |
| " u'New manufacturing economy',\n", | |
| " u'Alcohol industry',\n", | |
| " u'Tobacco industry',\n", | |
| " u'Cultural industry',\n", | |
| " u'Information industry',\n", | |
| " u'Activism industry',\n", | |
| " u'Sustainable industries',\n", | |
| " u'Health care industry',\n", | |
| " u'Software industry',\n", | |
| " u'Zu Audio',\n", | |
| " u'Space-based industry',\n", | |
| " u'Radio industry',\n", | |
| " u'Energy industry',\n", | |
| " u'Naval stores industry',\n", | |
| " u'Housing industry',\n", | |
| " u'Converters (industry)',\n", | |
| " u'Entertainment industry',\n", | |
| " u'Commercial item transport and distribution',\n", | |
| " u'Food industry',\n", | |
| " u'Fur trade',\n", | |
| " u'Industrial agriculture',\n", | |
| " u'Metalworking',\n", | |
| " u'Mining',\n", | |
| " u'Real estate',\n", | |
| " u'Service industries',\n", | |
| " u'Shipping',\n", | |
| " u'Plastics industry',\n", | |
| " u'Trade unions by industry',\n", | |
| " u'Companies by industry',\n", | |
| " u'Fishing industry',\n", | |
| " u'Industries by country',\n", | |
| " u'Media industry',\n", | |
| " u'Toy industry',\n", | |
| " u'Automotive industry',\n", | |
| " u'Computer industry',\n", | |
| " u'Cosmetics',\n", | |
| " u'Manufactured goods',\n", | |
| " u'Military industry',\n", | |
| " u'Pharmaceutical industry',\n", | |
| " u'Private spaceflight',\n", | |
| " u'Pulp and paper industry',\n", | |
| " u'Recycling industry',\n", | |
| " u'Businesspeople by industry',\n", | |
| " u'Death care industry',\n", | |
| " u'Industries by city',\n", | |
| " u'Communities by industry',\n", | |
| " u'Space industry',\n", | |
| " u'Electronics industry',\n", | |
| " u'Banking',\n", | |
| " u'Garment industry',\n", | |
| " u'Timber industry',\n", | |
| " u'Transport',\n", | |
| " u'Water industry',\n", | |
| " u'Stock market indices by industry',\n", | |
| " u'Petroleum industry',\n", | |
| " u'Steel industry']" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "list_industries" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
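| "## Create \"X-Tech\" names\n", | |
| "\n", | |
| "Remove the lowercase words \"industry\"/\"industries\" from each label, title-case the remainder, drop the spaces, and append \"Tech\".\n", | |
| "\n", | |
| "Note: the labels are used as-is, so entries that are not industry names (e.g. \"Zu Audio\", \"Companies by industry\") and labels with parentheses or a capitalized \"Industries\" still produce odd candidates such as `Converters()Tech` or `IndustriesByCountryTech`." | |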
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": { | |
| "collapsed": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "def techize(s):\n", | |
| " return s.replace(\"industry\", \"\").replace(\"industries\", \"\").strip().title().replace(\" \", \"\") + \"Tech\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": { | |
| "collapsed": false | |
| }, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[u'ChemicalTech',\n", | |
| " u'ShowBusinessTech',\n", | |
| " u'CompaniesByArmsSalesTech',\n", | |
| " u'MotorsportTech',\n", | |
| " u'PovertyTech',\n", | |
| " u'LeisureTech',\n", | |
| " u'Celebrity\\u2013IndustrialComplexTech',\n", | |
| " u'SportTech',\n", | |
| " u'AircraftTech',\n", | |
| " u'ArmsTech',\n", | |
| " u'ProfessionalSportsTech',\n", | |
| " u'SemiconductorTech',\n", | |
| " u'VideoGameTech',\n", | |
| " u'ElectricPowerTech',\n", | |
| " u'PaymentCardTech',\n", | |
| " u'PersonalCareTech',\n", | |
| " u'BicycleTech',\n", | |
| " u'CreativeTech',\n", | |
| " u'TextileTech',\n", | |
| " u'ProfessionalAudiovisualTech',\n", | |
| " u'NewManufacturingEconomyTech',\n", | |
| " u'AlcoholTech',\n", | |
| " u'TobaccoTech',\n", | |
| " u'CulturalTech',\n", | |
| " u'InformationTech',\n", | |
| " u'ActivismTech',\n", | |
| " u'SustainableTech',\n", | |
| " u'HealthCareTech',\n", | |
| " u'SoftwareTech',\n", | |
| " u'ZuAudioTech',\n", | |
| " u'Space-BasedTech',\n", | |
| " u'RadioTech',\n", | |
| " u'EnergyTech',\n", | |
| " u'NavalStoresTech',\n", | |
| " u'HousingTech',\n", | |
| " u'Converters()Tech',\n", | |
| " u'EntertainmentTech',\n", | |
| " u'CommercialItemTransportAndDistributionTech',\n", | |
| " u'FoodTech',\n", | |
| " u'FurTradeTech',\n", | |
| " u'IndustrialAgricultureTech',\n", | |
| " u'MetalworkingTech',\n", | |
| " u'MiningTech',\n", | |
| " u'RealEstateTech',\n", | |
| " u'ServiceTech',\n", | |
| " u'ShippingTech',\n", | |
| " u'PlasticsTech',\n", | |
| " u'TradeUnionsByTech',\n", | |
| " u'CompaniesByTech',\n", | |
| " u'FishingTech',\n", | |
| " u'IndustriesByCountryTech',\n", | |
| " u'MediaTech',\n", | |
| " u'ToyTech',\n", | |
| " u'AutomotiveTech',\n", | |
| " u'ComputerTech',\n", | |
| " u'CosmeticsTech',\n", | |
| " u'ManufacturedGoodsTech',\n", | |
| " u'MilitaryTech',\n", | |
| " u'PharmaceuticalTech',\n", | |
| " u'PrivateSpaceflightTech',\n", | |
| " u'PulpAndPaperTech',\n", | |
| " u'RecyclingTech',\n", | |
| " u'BusinesspeopleByTech',\n", | |
| " u'DeathCareTech',\n", | |
| " u'IndustriesByCityTech',\n", | |
| " u'CommunitiesByTech',\n", | |
| " u'SpaceTech',\n", | |
| " u'ElectronicsTech',\n", | |
| " u'BankingTech',\n", | |
| " u'GarmentTech',\n", | |
| " u'TimberTech',\n", | |
| " u'TransportTech',\n", | |
| " u'WaterTech',\n", | |
| " u'StockMarketIndicesByTech',\n", | |
| " u'PetroleumTech',\n", | |
| " u'SteelTech']" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "map(techize, list_industries)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 2", | |
| "language": "python", | |
| "name": "python2" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.10" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment