Skip to content

Instantly share code, notes, and snippets.

@nischalshrestha
Created August 9, 2023 17:56
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nischalshrestha/f89d0dbd709aa5fec68f8da0698fa9f7 to your computer and use it in GitHub Desktop.
Save nischalshrestha/f89d0dbd709aa5fec68f8da0698fa9f7 to your computer and use it in GitHub Desktop.
SQL examples
google-cloud-bigquery==1.8.0
pandas==0.25.1
"""
This script contains functions to query the sotorrent-org.2018_12_09.Posts dump.
NOTE: Please refer to the following instructions installing the Python package
for BigQuery for Python and for setting up BigQuery api token as an environment
variable before running script:
https://cloud.google.com/bigquery/docs/reference/libraries#client-libraries-install-python
"""
from google.cloud import bigquery
import pandas as pd
##################################
# Languages of interest
# source: https://erikbern.com/2017/03/15/the-eigenvector-of-why-we-moved-from-language-x-to-language-y.html
langs = ["java", "c", "c++", "c#", "python", "visual basic", "node",
"perl", "php", "ruby", "go", "swift", "objective c", "cobol", "fortran",
"lua", "scala", "lisp", "haskell", "rust", "erlang", "clojure", "matlab",
"pascal", "r", "kotlin"]
# some languages require the right tag text, this dict maps to the proper text
tag_langs = {
'visual basic': 'vb.net',
'node': 'node.js',
'objective c': 'objective-c',
}
langs.sort()
DIR = '/Volumes/TarDisk/soposts2'
##################################
# Utility functions for language text or language filename
def fix_lang_text(lang):
if lang == 'c':
return r"/\bin c\b/"
elif lang == 'r':
return r"/\bin r\b/"
elif lang == 'go':
return r"/\bin go\b/"
else:
return r"/\b"+lang+r"\b/"
def lang_file_name(lang):
if lang == 'c#':
return 'cs'
elif lang == 'c++':
return 'cpp'
elif lang == 'objective c':
return 'objectivec'
elif lang == 'visual basic':
return 'visualbasic'
return lang
def revert_lang_file_name(lang):
if lang == 'cs':
return 'c#'
elif lang == 'cpp':
return 'c++'
elif lang == 'objectivec':
return 'objective c'
elif lang == 'visualbasic':
return 'visual basic'
return lang
##################################
# Functions for querying from SO
def query_language_pair(a, b, path='./'):
"""
Queries so torrent posts that are:
- taggged with a and b OR tagged with a and has b in the title/body
Saves the csv file locally on current path (default) or given path.
Arguments:
a -- the "source" language
b -- the "target" language
path -- the pathname to save the csv file to
"""
# https://cloud.google.com/bigquery/docs/parameterized-queries#bigquery-query-params-python
btext = fix_lang_text(b)
atag = "%<" + tag_langs[a] + ">%" if a in tag_langs else "%<" + a + ">%"
btag = "%<" + tag_langs[b] + ">%" if b in tag_langs else "%<" + b + ">%"
lang_no_space = lang_file_name(b).replace(' ','')
query = \
f'''
SELECT CONCAT('http://stackoverflow.com/q/', Cast(P.Id as string)) as URL, P.Title, P.ViewCount, P.Score, P.AcceptedAnswerId, P.Tags, P.Body
FROM `sotorrent-org.2018_12_09.Posts` P
WHERE P.PostTypeId = 1 AND (
(P.Tags LIKE '{atag}' AND P.Tags LIKE '{btag}')
OR
(P.Tags LIKE '{atag}' AND (P.Title LIKE '{btext}' OR P.Body LIKE '{btext}'))
)
AND
P.Score >= 0
AND
P.AcceptedAnswerId IS NOT NULL
'''
client = bigquery.Client()
print(f'Querying posts for pair <{a}, {b}>...')
df = client.query(
query,
location='US').to_dataframe() # API request - starts the query
a_filename = lang_file_name(a)
b_filename = lang_file_name(b)
print('Saving...')
df.to_csv(f'{path}/{a_filename}_{b_filename}.csv', index=False)
def build_sql_expr(a):
"""
Builds and chains the SELECT queries
Arguments:
a -- the source/previous language
"""
sql_expr = """SELECT * FROM\n\n"""
# in this instance we only set atag once as we're writing a query for
# <a, b> where b is all the other target languages
atag = "%<" + a + ">%"
if a in tag_langs:
atag = "%<" + tag_langs[a] + ">%"
for i, l in enumerate(langs):
btext = fix_lang_text(l)
btag = "%<" + l + ">%"
if l in tag_langs:
btag = "%<" + tag_langs[l] + ">%"
lang_no_space = lang_file_name(l).replace(' ','')
if l != a:
sql_expr += \
f'''
(SELECT COUNT(distinct P.Id) AS {lang_no_space}
FROM `sotorrent-org.2018_12_09.Posts` P
WHERE P.PostTypeId = 1 AND (
(P.Tags LIKE '{atag}' AND P.Tags LIKE '{btag}')
OR
(P.Tags LIKE '{atag}' AND (P.Title LIKE '{btext}' OR P.Body LIKE '{btext}'))
)
AND
P.Score >= 0) AS {lang_no_space}'''
# since visual basic is the last in the list, this makes sure that
# we don't add a comma and \n after the swift SELECT
if a == 'visual basic':
if l == 'swift':
continue
if i != len(langs) - 1:
sql_expr += ',\n'
return sql_expr
def query_all():
"""
Queries all pairs and downloads a csv containing counts for each pair
in a folder called langcounts in current directory
"""
for l in langs:
print(f'Querying for {l}...')
sql_expr = build_sql_expr(l)
client = bigquery.Client()
# API request - starts the query and converts to df
df = client.query(sql_expr).to_dataframe()
csv_name = lang_file_name(l)
# TODO don't hardcode the root path
df.to_csv(f'langcounts/{csv_name}.csv', index=False)
def query_pairs():
'''
Query sepecific language pairs
These are based on pairs resulting from our stop rule criteria in paper:
c cpp
cs visualbasic
clojure java
java cs
kotlin java
lua cpp
matlab python
node php
objectivec swift
perl python
php java
python cpp
r python
ruby python
scala java
'''
# TODO have a way to pass in a list of previous to target languages instead
# of hardcoding.
prev = ['c', 'cs', 'clojure', 'java', 'kotlin', 'lua', 'matlab', 'node',
'objectivec', 'perl', 'php', 'python', 'r', 'ruby', 'scala']
target = ['cpp', 'visualbasic', 'java', 'cs', 'java', 'cpp', 'python', 'php',
'swift', 'python', 'java', 'cpp', 'python', 'python', 'java']
for p, t in zip(prev, target):
p = revert_lang_file_name(p)
t = revert_lang_file_name(t)
print(f'Saving posts for <{p}, {t}>...')
query_language_pair(p, t, path=DIR)
if __name__ == '__main__':
"""
TODO accept arguments for either querying all combinations or specific
language pairs
"""
# or query for posts
# Uncomment this to query SO posts for languages in lang
query_all()
# Uncomment this to query SO posts for specific language pairs in lang
# query_pairs()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment