Skip to content

Instantly share code, notes, and snippets.

@Ebrahim-Ramadan
Last active December 21, 2023 07:32
Show Gist options
  • Save Ebrahim-Ramadan/82b4fc3365a4c89cafea83aa36dfaad9 to your computer and use it in GitHub Desktop.
Save Ebrahim-Ramadan/82b4fc3365a4c89cafea83aa36dfaad9 to your computer and use it in GitHub Desktop.
paper-citation-project
import time
from collections import defaultdict
# Read node data (paper ID and title) from nodes.csv
def read_nodes(file):
    """Load the paper-ID -> title mapping from a CSV file.

    The first row is treated as a header and skipped. Each remaining row
    supplies the paper ID in its first column and the title in its second;
    any further columns are ignored.

    Rows are parsed with the ``csv`` module so that a quoted title containing
    commas stays intact, and rows with fewer than two columns (for example
    blank lines) are skipped instead of raising ``IndexError``.

    Args:
        file: Path of the CSV file to read (opened as UTF-8).

    Returns:
        A dict mapping paper ID to title, both as strings with surrounding
        whitespace removed. If an ID occurs more than once, the last row wins.
    """
    import csv  # Local import so the function does not rely on file-level imports.

    nodes = {}
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header (also tolerates an empty file)
        for row in reader:
            if len(row) < 2:
                continue  # Blank or malformed row: nothing usable to store.
            nodes[row[0].strip()] = row[1].strip()
    return nodes
# Read edge data from edges.csv
def read_edges(file):
    """Load edges from a CSV file as a list of 2-tuples.

    The first row is treated as a header and skipped. Each remaining row
    contributes one ``(first_column, second_column)`` tuple; any further
    columns are ignored.
    NOTE(review): the columns are presumably (citing paper, cited paper) --
    confirm against the data file.

    Rows are parsed with the ``csv`` module, and rows with fewer than two
    columns (for example blank lines) are skipped instead of raising
    ``IndexError``.

    Args:
        file: Path of the CSV file to read (opened as UTF-8).

    Returns:
        A list of ``(str, str)`` tuples in file order, with surrounding
        whitespace removed from each value.
    """
    import csv  # Local import so the function does not rely on file-level imports.

    # newline='' lets the csv module handle line endings itself.
    with open(file, 'r', encoding='utf-8', newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # Skip header (also tolerates an empty file)
        return [
            (row[0].strip(), row[1].strip())
            for row in reader
            if len(row) >= 2
        ]
# Retrieve the most-cited papers
def highest_cited_papers(nodes, edges):
    """Return the known papers ordered from most to least cited.

    The second element of every edge is counted as one citation of that
    paper. Papers with equal counts keep the order in which they were first
    cited. Cited IDs that are missing from ``nodes`` are left out of the
    result.

    Args:
        nodes: Mapping of paper ID to title.
        edges: Iterable of edges; only index 1 of each edge is read.

    Returns:
        A list of ``(paper_id, title)`` tuples, most-cited first.
    """
    from collections import Counter  # Local import; the file only imports defaultdict.

    citation_count = Counter(edge[1] for edge in edges)
    # most_common() sorts by count, descending, and keeps first-seen order
    # for ties -- the same ordering as a stable reverse sort on the counts.
    return [
        (paper_id, nodes[paper_id])
        for paper_id, _ in citation_count.most_common()
        if paper_id in nodes
    ]
# Find the closest groups of papers (papers linked to each other by citations)
def closest_group_of_papers(edges, threshold):
    """Group papers that are linked, directly or indirectly, by citations.

    Every edge is treated as an undirected link between its two papers, and
    the papers are partitioned into groups in which each paper can be reached
    from any other by following links. A depth-first walk with an explicit
    stack collects each group.

    Args:
        edges: Iterable of ``(paper_id, citation_id)`` pairs.
        threshold: Minimum number of papers a group must contain to be
            returned.

    Returns:
        A list of groups, each a list of paper IDs. Groups appear in the
        order their first paper appears in ``edges``; members are listed in
        the order the walk reaches them, so the output is the same on every
        run. Groups consisting of a single paper are never returned, whatever
        the threshold.
    """
    # Neighbours are stored as dict keys rather than a set: dicts preserve
    # insertion order, which keeps the traversal (and thus the output)
    # independent of string hash randomisation.
    graph = defaultdict(dict)
    for paper_id, citation_id in edges:
        graph[paper_id][citation_id] = None
        graph[citation_id][paper_id] = None

    closest_group = []
    processed = set()
    for start_node in graph:
        if start_node in processed:
            continue
        stack = [start_node]
        group = []
        while stack:
            node = stack.pop()
            if node in processed:
                # A node can be pushed more than once before it is visited.
                continue
            processed.add(node)
            group.append(node)
            stack.extend(
                neighbour for neighbour in graph[node]
                if neighbour not in processed
            )
        if len(group) > 1 and len(group) >= threshold:
            closest_group.append(group)
    return closest_group
# Script body: load both CSV files, report the ten most-cited papers and the
# citation-linked groups, then print how long the whole run took.
nodes_file = 'nodes.csv'
edges_file = 'edges.csv'
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; unlike time.time() it is unaffected by system clock changes.
start_time = time.perf_counter()
nodes_data = read_nodes(nodes_file)
edges_data = read_edges(edges_file)
highest_cited = highest_cited_papers(nodes_data, edges_data)
print("Highest cited papers:")
for paper_id, paper_title in highest_cited[:10]:
    print(f"Paper ID: {paper_id}, Title: {paper_title}")
threshold_value = 5  # You may adjust the threshold value
closest_group = closest_group_of_papers(edges_data, threshold_value)
# NOTE(review): closest_group_of_papers() filters on the number of papers in
# a group, not on a count of shared citations -- the wording below may need
# revisiting.
print(
    f"\nclosest group of papers with at least {threshold_value} common citations:")
print(closest_group)
end_time = time.perf_counter()
execution_time = end_time - start_time
print(f"\nExecution time: {execution_time} s")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment