Skip to content

Instantly share code, notes, and snippets.

@imankulov
Last active August 19, 2020 12:47
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save imankulov/9b7ea86059aa2c1048d003e90541d807 to your computer and use it in GitHub Desktop.
Save imankulov/9b7ea86059aa2c1048d003e90541d807 to your computer and use it in GitHub Desktop.
Analyze the percentage of the topic-specific StackOverflow questions for each programming language
#!/usr/bin/env python
"""
An ad-hoc script to analyze the percentage of the topic-specific questions for
each programming language.
For each language, the script downloads two pages and looks for the number of questions
in the title, like "NNN,NNN questions". For example, for "python" and
an extra tag "architecture", the script downloads
https://stackoverflow.com/questions/tagged/python and
https://stackoverflow.com/questions/tagged/python+architecture.
The "Tag Popularity" value is the number of questions carrying both the language
tag and the extra tag per 1,000 questions carrying the language tag.
The script saves the output in a CSV file with the name of the tag.
The output for "architecture.csv" is available here: https://chart-studio.plotly.com/~RomanImankulov/55
"""
import re
import sys
import requests
from lxml.etree import HTML
import pandas as pd
from urllib.parse import quote
# Stack Overflow language tags to compare. The script's entry point passes
# this list to analyze_languages() together with the secondary tag.
language_tags = [
    "c#",
    "java",
    "c++",
    "python",
    "ruby",
    "javascript",
    "php",
    "go",
    "rust",
    "kotlin",
    "swift",
]
# Column order of the result table. Pinning it here means an empty
# ``languages`` input still yields a frame (and a CSV header) with the
# expected schema instead of a column-less frame.
_RESULT_COLUMNS = ["language", "questions", "question_with_tag", "tag_popularity"]


def analyze_languages(languages, secondary_tag):
    """Build a table of tag statistics with one row per language.

    Args:
        languages: Iterable of language tag names.
        secondary_tag: Extra tag that is combined with every language tag.

    Returns:
        A pandas DataFrame with the columns "language", "questions",
        "question_with_tag" and "tag_popularity". The frame has zero rows
        (but still these columns) when ``languages`` is empty.
    """
    records = [analyze_language(lang, secondary_tag) for lang in languages]
    return pd.DataFrame.from_records(records, columns=_RESULT_COLUMNS)
def analyze_language(language, secondary_tag):
    """Collect question counts for one language and one secondary tag.

    Args:
        language: Language tag name.
        secondary_tag: Extra tag to combine with the language tag.

    Returns:
        A dict with the keys "language", "questions" (questions carrying the
        language tag), "question_with_tag" (questions carrying both tags) and
        "tag_popularity" (questions with both tags per 1000 questions with
        the language tag; 0.0 when the language tag has no questions).
    """
    questions = get_question_count([language])
    questions_with_tag = get_question_count([language, secondary_tag])
    # A language tag without any questions would otherwise raise
    # ZeroDivisionError; report a popularity of 0.0 for it instead.
    if questions:
        tag_popularity = questions_with_tag * 1000 / questions
    else:
        tag_popularity = 0.0
    return {
        "language": language,
        "questions": questions,
        "question_with_tag": questions_with_tag,
        "tag_popularity": tag_popularity,
    }
def get_question_count(tags, timeout=30):
    """Return the number of questions, tagged with provided tags.

    The tags are joined with spaces, URL-quoted and appended to
    https://stackoverflow.com/questions/tagged/. The count is taken from the
    text of the first element matching the XPath selector below, keeping
    only its digits (so thousands separators are dropped).

    Args:
        tags: List of tag names that must all be present on a question.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot hang the script forever.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        IndexError: If the page has no element matching the selector.
        ValueError: If the matched element's text contains no digits.
    """
    formatted_tags = quote(" ".join(tags))
    url = f"https://stackoverflow.com/questions/tagged/{formatted_tags}"
    resp = requests.get(url, timeout=timeout)
    # Fail on error responses right away instead of trying to parse an
    # error page that has no question counter.
    resp.raise_for_status()
    tree = HTML(resp.text).getroottree()
    xpath_selector = '//div[@id="mainbar"]//div[contains(@class, "mr12")]'
    matches = tree.xpath(xpath_selector)
    if not matches:
        raise IndexError(f"question counter not found on {url}")
    # ``.text`` is None when the element has no leading text.
    count_text = (matches[0].text or "").strip()
    digits = "".join(re.findall(r"\d+", count_text))
    if not digits:
        raise ValueError(f"no question count in {count_text!r} on {url}")
    return int(digits)
if __name__ == "__main__":
    # The secondary tag is the only required argument. Exit with a usage
    # message rather than an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} SECONDARY_TAG")
    secondary_tag = sys.argv[1]
    df = analyze_languages(language_tags, secondary_tag)
    # The output file is named after the tag, e.g. "architecture.csv".
    df.to_csv(f"{secondary_tag}.csv", index=False)
language questions question_with_tag tag_popularity
c# 1429283 4751 3.324044293537389
java 1705257 13349 7.828145552254
c++ 687722 10857 15.786902265741098
python 1514427 9523 6.288186885204767
ruby 214977 813 3.7817999134791163
javascript 2070667 4616 2.2292333822869637
php 1366946 2128 1.5567549852005858
go 48037 174 3.622207881424735
rust 17761 35 1.970609762963797
kotlin 44392 48 1.081275905568571
swift 275312 288 1.0460858952751788
language questions question_with_tag tag_popularity
c# 1429343 1943 1.3593658065278942
java 1705327 1420 0.8326848751002007
c++ 687757 407 0.5917787823315502
python 1514603 315 0.20797529121492564
ruby 214981 96 0.4465510905614915
javascript 2070781 519 0.25063007628522765
php 1367002 519 0.3796629412392959
go 48045 47 0.9782495577063169
rust 17763 7 0.3940775769858695
kotlin 44399 39 0.8783981621207685
swift 275332 98 0.3559339270408089
language questions question_with_tag tag_popularity
c# 1429280 5090 3.561233628120452
java 1705254 6490 3.805884636540949
c++ 687722 2237 3.2527678335141235
python 1514423 884 0.5837206645699385
ruby 214978 226 1.0512703625487259
javascript 2070668 1591 0.7683510828389679
php 1366947 1566 1.1456186670002568
go 48037 64 1.332306347190707
rust 17761 20 1.1260627216935983
kotlin 44391 60 1.351625329458674
swift 275312 216 0.784564421456384
language questions question_with_tag tag_popularity
c# 1429280 3391 2.3725232284786744
java 1705254 5722 3.355511847501897
c++ 687723 474 0.6892309839862851
python 1514424 1049 0.6926725936725778
ruby 214978 277 1.2885039399380402
javascript 2070668 3328 1.6072108131289033
php 1366947 7497 5.48448476788054
go 48037 69 1.4363927805649812
rust 17761 8 0.45042508867743936
kotlin 44392 22 0.4955847900522617
swift 275312 178 0.6465391991631313
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment