CS7280 Assignment 1.py

#!/usr/bin/env python
# coding: utf-8

# # Assignment 1: Getting Started with NetworkX
# ## Spring 2024

import matplotlib.pyplot as plt
import networkx as nx
import seaborn as sns
import numpy as np
import scipy as sp
from typing import Tuple, List, Dict, Union

# ## Part 1- Intro to NetworkX [20 points]
# ### Generating graphs with NetworkX

#Implement your code and show the visualization

def practice_graphs() -> None:

Nothing, but draws the visualizations of the graphs.

# This line is a placeholder
G = nx.Graph()
nx.draw(G, with_labels=True)

# Write a function that returns the 3 10-node toy networks from the assignment pdf

def create_toy_graphs() -> Tuple[nx.Graph, nx.Graph, nx.Graph]:

cycle: a networkx graph object meeting the requirements of a cycle
clique: a networkx graph object meeting the requirements of a clique
star: a networkx graph object meeting the requirements of a star network

#These lines are placeholders
cycle = nx.Graph()
clique = nx.Graph()
star = nx.Graph()

return cycle, clique, star

# Write a function that will return the leading eigenvalue of the adjacency matrix of a given network.

def calculate_leading_eigenvalue(G) -> np.float64:
G: NetworkX graph object

eig (float): leading eigenvalue of the adjacency matrix

# This is a placeholder
eig = 7280.00

return eig

# What is the relationship between the eigenvalue of the adjacency matrix, the maximum degree, and average degree of each network?

# ### Importing Map Data

# write a function to load cities_data.graphml using built in NetworkX methods
def load_cities_data() -> nx.Graph:
G: NetworkX Graph object

# this is a placeholder, load the city data into a graph, G

# find the number of nodes in the graph
def find_number_of_nodes(G) -> int:
G: NetworkX Graph object

num_nodes (int): number of nodes in G

# this is a placeholder, find the number of nodes in G
num_nodes = 0

return num_nodes

# find the number of edges in the graph
def find_number_of_edges(G) -> int:
G: NetworkX Graph object

num_edges (int): number of edges in G

# this is a placeholder, find the number of edges in G
num_edges = 0

return num_edges

# find the number of city pairs that are less than 50 miles apart
def cities_within_50(G) -> int:
G: NetworkX Graph object

num_cities: number of city pairs that are less than 50 miles apart

# this is a placeholder
# find the number of pairs of cities that are < 50 miles apart num_cities = 0 return num_cities #Implement your code and show the visualization of Question 4 def cities_within_100(G, city_list) -> nx.Graph:
G: NetworkX graph object
city_list: list of strings (names of cities in G)

S: subgraph of G that only contains edges between cities in “city_list” and directly neighboring cities that are less than 100 miles away

# This is a placeholder
S = nx.Graph()

# This cell is used to output your data. Do not modify it.

# 1.1 practice drawing a graph
practice_graphs()
plt.show()

# 1.2 practice using graph generators and find max eigenvavlue
G_cycle, G_clique, G_star = create_toy_graphs()

print(“leading eigenvalue for cycle graph”, calculate_leading_eigenvalue(G_cycle))
print(“leading eigenvalue for clique graph”, calculate_leading_eigenvalue(G_clique))
print(“leading eigenvalue for star graph”, calculate_leading_eigenvalue(G_star))

# 1.3 load city data and gather some data
G_cities = load_cities_data()
print(“number of nodes in cities_data:”, find_number_of_nodes(G_cities))
print(“number of edges in cities_data:”, find_number_of_edges(G_cities))
print(“number of city pairs within 50 miles:”, cities_within_50(G_cities))

# 1.4 find cities within 100 miles
S = cities_within_100(G_cities, [ “Toledo, OH”, “Stockton, CA”, “San Francisco, CA” ])
nx.draw(S, with_labels=True, pos=nx.spring_layout(S, k=3))
plt.show()

# ### Part 2 – Random Walks of Les Misérables [20 points]

# write a function to load lesmis_data.gml using built in NetworkX methods
def load_lesmis_data() -> nx.Graph:
G: NetworkX Graph object

# this is a placeholder, load the lesmis data into a graph, G

# Write a function that will return if the graph connected or unconnected.
def lesmis_connected(G) -> bool:
les_mis_connected (Boolean): whether the graph is connected or not

# These lines are placeholders. Use NetworkX to determine if the graph is connected.
les_mis_connected = True

return les_mis_connected

# calculate the shortest path length between each pair of nodes in the graph.
def calculate_shortest_paths(G) -> List:
G: NetworkX graph object

shortest_paths (list): list of shortest paths between each pair of nodes in the graph.

You may use any structure you’d like to store the shortest paths. The list return above is merely a suggestion.
It can be a list of path lengths, a dictionary with node pairs as keys and path lengths as values, etc.
# These lines are placeholders
# You may use any structure you’d like to store the shortest paths.
# It can be a list of path lengths, a dictionary with node pairs as keys and path lengths as values, etc.
shortest_paths = []

return shortest_paths

# plot the distribution of shortest paths using a histogram or barplot.
def plot_shortest_paths(G) -> None:
G: NetworkX graph object

# Create a plot of the distribution of shortest paths.
# you may use the calculate_shortest_paths fuction to help.
plt.show()

# find the average shortest path length of the graph.
def average_shortest_path(G) -> float:
G: NetworkX graph object

aspl (float): average shortest path length of the graph
# This line is a placeholder
# You may use the calculate_shortest_paths fuction to help.
aspl = 7280.00

return aspl

# find the maximum shortest path length of the graph.
def maximum_shortest_path(G) -> int:
G: NetworkX graph object

mspl (int): maximum shortest path length of the graph
# This line is a placeholder
# You may use the calculate_shortest_paths fuction to help.
mspl = 7280

return mspl

# Implement a function that will calculate the stationary distribution to find the 3 most commonly mentioned characters in the novel.
def popular_characters(G) -> List[str]:

G: NetworkX graph object

top_3: List[str]: a list of the string names of the 3 most commonly mentioned characters in the novel

# This line is a placeholder
top_3 = [“George”, “P.”, “Burdell”]

return top_3

# This cell is used to output your data. Do not modify it.

G_lesmis = load_lesmis_data()

print(“is this graph connected?”, lesmis_connected(G_lesmis))

print(“What is the average shortest path length?”, average_shortest_path(G_lesmis))
print(“What is the maximum shortest path length?”, maximum_shortest_path(G_lesmis))

print(“What 3 characters are most commonly mentioned characters?”, popular_characters(G_lesmis))

plot_shortest_paths(G_lesmis)

# ### Part 3 – Components of Drosophila Optic Medulla [20 points]

# write a function to load drosophila_medulla_data.graphml using built in NetworkX methods
def load_drosophila_medulla_data() -> nx.Graph:
G: NetworkX Graph object

# this is a placeholder, load the drosophila_medulla data into a graph, G

# Write a function to find the weakly connected components of this network, and return how many there are and what percentage of the nodes are in the largest weakly connected component.
def weakly_connected(G) -> Tuple[int, float]:
G: NetworkX graph object

wccs (int): number of weakly connected components in the graph
wpct (float): percent of nodes in the graph that belong to the largest weakly connected component

# These lines are placeholders
wccs = 7280
wpct = 0.7280

return wccs, wpct

# Write a function to find the strongly connected components of this network, and return how many there are and what percentage of the nodes are in the largest strongly connected component

def strongly_connected(G) -> Tuple[int, float]:
G: NetworkX graph object

sccs (int): number of strongly connected components in the graph
spct (float): percent of nodes in the graph that belong to the largest strongly connected component

#These lines are placeholders
sccs = 7280
spct = 0.7280

return sccs, spct

# Calculate the average shortest path length and maximum shortest path length of the largest strongly connected component

def scc_shortest_paths(G) -> List:
G: NetworkX graph object

shortest_paths (list): list of shortest paths between each pair of nodes in the graph.

# These lines are placeholders
# Remember to use Dijksta’s algorithm to calculate the shortest path length for this part.
# You may use any structure you like to store shortest paths.
shortest_paths = []

return shortest_paths

# plot the distribution of shortest paths for the scc using a histogram or barplot.
def plot_scc_shortest_paths(shortest_paths) -> None:
shortest_paths (list): list of shortest paths between each pair of nodes in the graph.

# Create a plot of the distribution of shortest paths.
# you may use the scc_shortest_paths fuction to help.
plt.show()

# find the average shortest path length of the scc.
def average_scc_shortest_path(shortest_paths) -> float:
shortest_paths (list): list of shortest paths between each pair of nodes in the graph.

aspl (float): average shortest path length of the graph
# This line is a placeholder
# You may use the calculate_shortest_paths fuction to help.
aspl = 7280.00

return aspl

# find the maximum shortest path length of the scc.
def maximum_scc_shortest_path(shortest_paths) -> int:
shortest_paths (list): list of shortest paths between each pair of nodes in the graph.

mspl (int): maximum shortest path length of the graph
# This line is a placeholder
# You may use the calculate_shortest_paths fuction to help.
mspl = 7280

return mspl

# This cell is used to output your data. Do not modify it.

G_dm = load_drosophila_medulla_data()

wccs, wpct = weakly_connected(G_dm)
print(“How many weakly connected components are there?”, wccs)
print(“What percent of nodes belong to the largest weakly connected component”, wpct)

sccs, spct = strongly_connected(G_dm)
print(“How many strongly connected components are there?”, sccs)
print(“What percent of nodes belong to the largest strongly connected component”, spct)

shortest_paths = scc_shortest_paths(G_dm)
print(“What is the average shortest path length of the SCC”, average_scc_shortest_path(shortest_paths))
print(“What is the maximum shortest path length of the SCC”, maximum_scc_shortest_path(shortest_paths))

plot_scc_shortest_paths(shortest_paths)

# ### Part 4 – Topologically Ordered Languages [20 points]

# write a function to load language_data.graphml using built in NetworkX methods
def load_language_data() -> nx.Graph:
G: NetworkX Graph object

# this is a placeholder, load the language data into a graph, G

# Write a function that will determine if a graph is a DAG or not

def is_graph_dag(G) -> bool:
G: NetworkX graph object

is_dag (Boolean): is the graph directed or not

# This is a placeholder
is_dag = True
is_dag = False

return is_dag

# Implement a function that will remove the first edge of each cycle

def remove_cycles(G) -> nx.Graph:
G: NetworkX graph object

G2: NetworkX graph object with cycles removed

#This is placeholder

# Write a function that returns the number of sources and the highest influencing node

def get_sources(G) -> Tuple[int, str]:
G: NetworkX graph object

num_sources (int): number of source nodes
top_source (str): the name of the node with the highest level of influence

#These are placeholders
num_sources = 7280
top_source = “George P. Burdell”

return num_sources, top_source

# This cell is used to output your data. Do not modify it.

G_lang = load_language_data()

print(“Is this graph a DAG?”, is_graph_dag(G_lang))

print(“Is this graph a DAG after modification?”, is_graph_dag(remove_cycles(G_lang)))

num_sources, top_source = get_sources(G_lang)
print(“There are {} sources in the graph, and the source with the highest influence is {}.”.format(num_sources, top_source))

# ### Part 5 – Bipartite Projects of Github

# Write a function that will read in the github text file and return it as a NetworkX graph

def load_github_data() -> Tuple[nx.Graph, List[str], List[str]]:
G: NetworkX graph object

G = nx.Graph()

# NOTE: We are also returning a list of users and projects. This will be helpful
# when getting the correct user and project indicies from the projections.
uid_list = []
pid_list = []

return G, uid_list, pid_list

# Create a function to create the User-Project Matrix

def calculate_projections(G, uid_list, pid_list) -> Tuple[sp.sparse.spmatrix, sp.sparse.spmatrix]:
G: NetworkX graph object

user_matrix: one mode projection for users
project_matrix: one mode projection for projects

users_projection = np.array([0,0])
projects_projection = np.array([0,0])

return users_projection, projects_projection

# Write a function that will return the pair of users that share the highest number of Github projects between them.

def get_user_pair(M, uid_list) -> Tuple[str, str]:
M: projected matrix

u1 (str) – first user
u2 (str) – second user

u1 = “George123”
u2 = “Burdell456”

return u1, u2

# Write a function that will return the pair of projects that share the highest number of users between them.
def get_project_pair(M, pid_list) -> Tuple[str, str]:
M: projected matrix

p1 (str) – first project
p2 (str) – second project

p1 = “George123”
p2 = “Burdell456”

return p1, p2

# This cell is used to output your data. Do not modify it.

G_gh, uid_list, pid_list = load_github_data()
user_matrix, project_matrix = calculate_projections(G_gh, uid_list, pid_list)
get_user_pair(user_matrix, uid_list)
get_project_pair(project_matrix, pid_list)