faizankshaikh/check_code_present

## check_code_present
code_present

## check_dirs
# Collect the names of all sub-directories directly under `root`.
root = "."
dirlist = list(
    filter(
        lambda name: os.path.isdir(os.path.join(root, name)),
        os.listdir(root),
    )
)
print(dirlist)

## check_num_dir
len(dirlist)

## check_top_github_users
# Ten most frequent first path segments among the github.com code links.
# urlparse(x)[2] is the URL path, so element 1 of the "/"-split is the text
# between the first two slashes (presumably the account name — verify).
# A link with an empty path would raise IndexError here.
code_links_df.loc[code_links_df.domains == "github.com"].links.apply(
    lambda x: urlparse(x)[2].split("/")[1]
).value_counts().head(10)

## clean_repo_links


temp_link = ""


def clean_github_link(link):
    """Normalize a repository URL so that it ends in ``.git``.

    Surrounding whitespace and trailing slashes are removed before the
    suffix check; without that, ``.../repo/`` would become ``.../repo/.git``.

    Args:
        link: Repository URL as a string.

    Returns:
        The cleaned URL, with ``.git`` appended unless already present.
    """
    link = link.strip().rstrip("/")
    if link.endswith(".git"):
        return link
    return link + ".git"


# Array of normalized URLs for every link whose domain is github.com.
github_repo_links = (
    code_links_df.links[code_links_df.domains == "github.com"]
    .apply(clean_github_link)
    .values
)

## clean_tools_names
def cleaner(tool_list):
    """Reduce requirement entries to bare package names.

    Each entry keeps only its leading run of word characters, so
    ``"numpy==1.18\\n"`` becomes ``"numpy"``. Entries that do not start with
    a word character (blank lines, ``-e ...`` options) and non-string
    entries are dropped.

    Args:
        tool_list: Iterable of requirement strings. A non-iterable value
            (for example a NaN standing in for a missing row) is treated
            as an empty list.

    Returns:
        The cleaned names joined with ``","``; ``""`` when nothing is left.
    """
    try:
        entries = list(tool_list)
    except TypeError:
        # Not iterable at all: there is nothing to clean.
        return ""

    names = []
    for tool in entries:
        if not isinstance(tool, str):
            continue
        match = re.match(r"\w+", tool)
        if match:
            names.append(match.group(0))
    return ",".join(names)


all_tools["all_tool_names_cleaned"] = all_tools.all_tool_names.str.split(",").apply(
    cleaner
)

## create_link_df
code_links_df = pd.DataFrame({"links": code_links})

## download_github_repos
# takes about 24 minutes to download
# Shallow-clone (--depth 1) each GitHub link. The leading `!` is IPython
# shell syntax, so this cell only runs inside a notebook.
for link in github_repo_links:
    !git clone $link --depth 1 --quiet

## find_sucessfully_parsed
# Tally how many repos under /content have a requirements.txt and how many do not.
has_req_cnt = 0
no_req_cnt = 0
for repo in dirlist:
    path = "/content/" + repo
    if not os.path.exists(path + "/requirements.txt"):
        no_req_cnt += 1
        continue
    has_req_cnt += 1

## generate_requirements
# takes about 10 minutes to run
# For every repo without its own requirements.txt, run pipreqs on the repo
# directory (IPython `!` shell syntax) — presumably to generate one; confirm
# against the pipreqs documentation.
for repo in dirlist:
    path = "/content/" + repo
    if os.path.exists(path + "/requirements.txt"):
        pass
    else:
        !pipreqs $path

## get_code_links
# Collect the "code" entry of every accepted submission that has one.
code_links = []
for note in accepted_submissions:
    try:
        code_link = note.content["code"]
    except (KeyError, TypeError):
        # A submission without a code link is an expected case, not an error,
        # so it is skipped silently rather than reported as "unexpected".
        continue
    code_links.append(code_link)

# Number of accepted submissions that provided a code link.
code_present = len(code_links)

## get_domains
code_links_df["domains"] = code_links_df.links.apply(lambda x: urlparse(x)[1])

## get_num_unique_tools
all_tools.all_tool_names_cleaned.str.split(",", expand=True).stack().unique().shape

## get_score_gym
give_score("gym")

## get_score_networkx
give_score("networkx")

## get_score_tensorboard
give_score("tensorboard")

## get_score_torch_tf_keras
give_score("torch")
print()
# offset=12 is added to both the tensorflow count and the total row count
# inside give_score. NOTE(review): where the 12 comes from is not shown
# here — confirm and document the source.
give_score("tensorflow", offset=12)
print()
give_score("keras")

## get_score_transformers
give_score("transformers")

## get_tool_count
all_tools.all_tool_names_cleaned.str.contains("torch").sum()

## give_score_func
def give_score(tool_name, offset=0):
    """Print how many rows of ``all_tools`` mention ``tool_name``.

    Args:
        tool_name: Pattern passed to ``str.contains`` on the
            ``all_tool_names_cleaned`` column.
        offset: Value added to both the match count and the row total
            before the percentage is computed.
    """
    matches = all_tools.all_tool_names_cleaned.str.contains(tool_name)
    count = matches.sum() + offset
    total = all_tools.shape[0] + offset
    percentage = round((count / total) * 100, 4)
    template = "Count of {} is {} and total usage is {}%"
    print(template.format(tool_name, count, percentage))

## imports
%matplotlib inline

import os
import re
import sys
import requests
import openreview
import pandas as pd
import matplotlib.pyplot as plt

from random import choice
from wordcloud import WordCloud
from urllib.parse import urlparse

## installation
!pip install openreview-py
!pip install pipreqs

## load_all_tools_df
all_tools = pd.read_csv("all_tools.csv")

## make_all_tools_df
all_tools = pd.DataFrame(
    {"all_repo_names": all_repo_names, "all_tool_names": all_tool_names}
)
all_tools.head()

## openreview_api
# Client for the public OpenReview endpoint.
client = openreview.Client(baseurl="https://openreview.net")

# Map note id -> note for every ICLR 2020 blind submission.
blind_notes = {
    note.id: note
    for note in openreview.tools.iterget_notes(
        client,
        invitation="ICLR.cc/2020/Conference/-/Blind_Submission",
        details="original",
    )
}

# Decision notes for all papers; this is consumed once by the list
# comprehension below, so re-running only that part would presumably see an
# exhausted iterator — verify against iterget_notes.
all_decision_notes = openreview.tools.iterget_notes(
    client, invitation="ICLR.cc/2020/Conference/Paper.*/-/Decision"
)

# Blind submission for each decision whose text contains "Accept".
# The lookup by decision_note.forum raises KeyError if a decision points at
# a forum id that is not in blind_notes.
accepted_submissions = [
    blind_notes[decision_note.forum]
    for decision_note in all_decision_notes
    if "Accept" in decision_note.content["decision"]
]

len(accepted_submissions)

## plot_top_10_tools
all_tools.all_tool_names_cleaned.str.split(",", expand=True).stack().value_counts()[
    :10
].plot(kind="bar")

## print_head_all_tools_df_1
all_tools.head()

## print_head_all_tools_df_2
all_tools.head()

## print_links_value_counts
code_links_df.domains.value_counts()

## print_num_parsed
has_req_cnt, no_req_cnt

## print_random_link
urlparse(choice(code_links))

## print_shape_all_tools_df
all_tools.shape

## print_top_50_tools
all_tools.all_tool_names_cleaned.str.split(",", expand=True).stack().value_counts()[:50]

## read_tools
# Read each repo's requirements.txt into two parallel lists: repo names and
# the lower-cased, comma-joined lines of the file.
all_repo_names = []
all_tool_names = []
for repo in dirlist:
    requirements_path = os.path.join("/content", repo, "requirements.txt")
    try:
        with open(requirements_path, "r", encoding="utf-8") as f:
            tools = f.readlines()
    except (OSError, UnicodeDecodeError) as err:
        # Missing or unreadable file: report it and move on to the next repo.
        print("Unexpected error for ", repo, type(err))
        continue

    all_repo_names.append(repo)
    all_tool_names.append(",".join(tools).lower())

## read_tools_file
all_tools = pd.read_csv("all_tools.csv")

## remove_extra_dirs
# Drop directories that are not cloned repos (presumably Colab defaults —
# verify). The membership check keeps this cell safe to re-run, because
# list.remove raises ValueError when the entry is already gone.
for extra_dir in (".config", "sample_data"):
    if extra_dir in dirlist:
        dirlist.remove(extra_dir)

## save_all_tools_df
all_tools.to_csv("all_tools.csv", index=False)

## wordcloud_top_100_tools
# Concatenate every row's comma-separated tool names into one string.
all_tool_string = ",".join(all_tools.all_tool_names_cleaned)

# Word cloud limited to 100 words on a white background.
wordcloud = WordCloud(background_color="white", max_words=100)
wordcloud.generate(all_tool_string)

# Show the cloud as an image with the axes hidden.
plt.figure(figsize=(10, 20))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()
	root = "."
	dirlist = [item for item in os.listdir(root) if os.path.isdir(os.path.join(root, item))]
	print(dirlist)
	code_links_df.loc[code_links_df.domains == "github.com"].links.apply(
	lambda x: urlparse(x)[2].split("/")[1]
	).value_counts().head(10)


	temp_link = ""


	def clean_github_link(link):
		"""Normalize a repository URL so that it ends in ``.git``.

		Surrounding whitespace and trailing slashes are removed before the
		suffix check; without that, ``.../repo/`` would become
		``.../repo/.git``.

		Args:
			link: Repository URL as a string.

		Returns:
			The cleaned URL, with ``.git`` appended unless already present.
		"""
		link = link.strip().rstrip("/")
		if link.endswith(".git"):
			return link
		return link + ".git"


	github_repo_links = (
	code_links_df.loc[code_links_df.domains == "github.com"]
	.links.apply(clean_github_link)
	.values
	)
	def cleaner(tool_list):
		"""Reduce requirement entries to bare package names.

		Each entry keeps only its leading run of word characters. Entries
		that do not start with a word character and non-string entries are
		dropped.

		Args:
			tool_list: Iterable of requirement strings. A non-iterable value
				is treated as an empty list.

		Returns:
			The cleaned names joined with ``","``; ``""`` when nothing is left.
		"""
		try:
			entries = list(tool_list)
		except TypeError:
			# Not iterable at all: there is nothing to clean.
			return ""

		names = []
		for tool in entries:
			if not isinstance(tool, str):
				continue
			match = re.match(r"\w+", tool)
			if match:
				names.append(match.group(0))
		return ",".join(names)


	all_tools["all_tool_names_cleaned"] = all_tools.all_tool_names.str.split(",").apply(
	cleaner
	)
	# takes about 24 minutes to download
	for link in github_repo_links:
	!git clone $link --depth 1 --quiet
	has_req_cnt = no_req_cnt = 0
	for repo in dirlist:
	path = "/content/" + repo
	if os.path.exists(path + "/requirements.txt"):
	has_req_cnt += 1
	else:
	no_req_cnt += 1
	# takes about 10 minutes to run
	for repo in dirlist:
	path = "/content/" + repo
	if os.path.exists(path + "/requirements.txt"):
	pass
	else:
	!pipreqs $path
	code_present = 0
	code_links = []
	for note in accepted_submissions:
	try:
	code_links.append(note.content["code"])
	# print("code found")
	code_present += 1
	except:
	print("Unexpected error:", sys.exc_info()[0])
	give_score("torch")
	print()
	give_score("tensorflow", offset=12)
	print()
	give_score("keras")
	def give_score(tool_name, offset=0):
		"""Print how many rows of ``all_tools`` mention ``tool_name``.

		Args:
			tool_name: Pattern passed to ``str.contains`` on the
				``all_tool_names_cleaned`` column.
			offset: Value added to both the match count and the row total
				before the percentage is computed.
		"""
		num = all_tools.all_tool_names_cleaned.str.contains(tool_name).sum()
		num += offset
		print(
			"Count of {} is {} and total usage is {}%".format(
				tool_name, num, round((num / (all_tools.shape[0]+offset)) * 100, 4)
			)
		)