Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
# =============================================================================
# KMeansClustering.py
# Name: Alycia Wong and Brandon Wong
# Date: June 2020
# Description: Process and graph a CSV file containing biomedical data that
# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD).
# Randomly generate up to 10 centroids without issue. Each centroid will have a
# classification. The nearest centroid to a point will determine the point's
# classification (decide what to do if the distances are equal yourself).
# Create random test cases until centroids stop moving and determine whether
# each case is likely to have CKD depending on the classification of the
# nearest centroid.
# Bonus: Create lines roughly separating each centroid group
# =============================================================================

# =============================================================================
# Import statements
# =============================================================================
import matplotlib.pyplot as plt
import numpy as np
import NearestNeighborClassifier as NNC
from scipy.spatial import KDTree as kdt

# =============================================================================
# Functions
# =============================================================================
# randomCentroids function takes in an integer number of clusters to be
# generated.
# OR asks for k number of integer clusters
# Outputs a 2D array filled with random values between 0-1. The
# first column represents glucose and the second column represents hemoglobin.
# There are k number of rows representing the number of centroids and the
# classification of each centroid (i.e.: row index = classification value).
# OR you can have a third column with the classification value.
def randomCentroids(k):
    """Return k random starting centroids as a (k, 2) array.

    Every coordinate is drawn uniformly from [0, 1) using NumPy's global
    random state. The row index doubles as the centroid's class label.
    """
    centroidShape = (k, 2)
    return np.random.random_sample(centroidShape)

# assignCentroids function takes in an array of normalized x (hemoglobin) and y
# (glucose) values from the CSV file and the randomly generated array of
# centroids from randomCentroids. Using the findDistance function from
# NearestNeighborClassifier, points are assigned the same classification as the
# nearest centroid. A 2D array of the normalized data and its classification
# are returned.
def assignToCentroids(normArr, centArr):
    """Return, for every data point, the row index of its nearest centroid.

    Parameters:
        normArr: either a 2D array of point coordinates (one row per point,
            same column order as centArr) or an object exposing such an
            array as its ``paras`` attribute.
        centArr: 2D array of centroid coordinates, one row per centroid.

    Returns:
        1D integer array; element i is the index into centArr of the
        centroid closest to point i.
    """
    # iterate() hands over the whole data wrapper, while KDTree.query needs a
    # plain coordinate array, so unwrap ``paras`` when it is present.
    points = getattr(normArr, "paras", normArr)
    # query() returns (distances, indices); only the indices are needed.
    _, nearest = kdt(centArr).query(points)
    return nearest
# print(assignToCentroids(NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, np.array([[.5, .5],[.25,.25]])))

# updateCentroids function inputs the 2D array of centroid locations and of
# classified and normalized CSV data. The average x (hemo) and y (gluc)
# positions of all data points for each classifications are found and an
# updated 2D array with these average cartesian points as the location for the
# new centroids is returned along with the original cartesian points.
#avg of all 1s will be new cent, avg of all 0s will be new cent

def updateCentroids(centArr, classArr, normArr):
    """Move every centroid to the mean position of the points assigned to it.

    Parameters:
        centArr: 2D array of current centroids; column 0 is glucose and
            column 1 is hemoglobin. It is not modified.
        classArr: 1D array giving the centroid index assigned to each point.
        normArr: data object exposing ``gluc`` and ``hemo`` 1D arrays that
            line up element-for-element with classArr.

    Returns:
        A new float array shaped like centArr holding the updated centroids.
        A centroid with no assigned points keeps its previous position.
    """
    # Float copy: the means are fractional and the caller's array must stay
    # untouched so it can be compared with the result afterwards.
    upCentArr = np.array(centArr, dtype=float)
    labels = np.asarray(classArr)
    for i in range(len(upCentArr)):
        members = labels == i
        if not members.any():
            # np.mean of an empty selection is NaN, which would poison this
            # centroid and prevent convergence; leave it where it is.
            continue
        upCentArr[i, 0] = np.mean(normArr.gluc[members])
        upCentArr[i, 1] = np.mean(normArr.hemo[members])
    return upCentArr
# centArr = np.array([[0.5, 0.5], [.25, .25]])
# print(updateCentroids(
# centArr, assignToCentroids(
# NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, centArr),
# NNC.normalizeData(NNC.openCSVFile('ckd.csv'))
# ))
# print(centArr)

# iterate void function can either
# a) input information and iterate the original information until centArr ~
# upCentArr
def iterate(normArr, centArr, maxIter=1000):
    """Repeat the assign/update cycle until the centroids stop moving.

    Parameters:
        normArr: data object exposing ``paras`` (2D coordinate array) plus
            the ``gluc`` and ``hemo`` columns read by updateCentroids.
        centArr: 2D array of starting centroids.
        maxIter: upper bound on the number of assign/update rounds, so a
            run that never settles cannot loop forever.

    Returns:
        The centroid array from the first round in which the update left
        every coordinate unchanged, or the latest centroids once maxIter
        rounds have been performed.
    """
    # A loop replaces the original self-recursion, which added one stack
    # frame per round and could hit Python's recursion limit.
    for _ in range(maxIter):
        # The KD-tree query needs the plain coordinate array, not the wrapper.
        classArr = assignToCentroids(normArr.paras, centArr)
        upCentArr = updateCentroids(centArr, classArr, normArr)
        if not (upCentArr != centArr).any():
            return centArr
        centArr = upCentArr
    return centArr
# Demo run executed at import time: cluster the normalised CKD data starting
# from two fixed centroids and print where they settle.
demoData = NNC.normalizeData(NNC.openCSVFile('ckd.csv'))
demoCentroids = np.array([[.5, .5], [.25, .25]])
print(iterate(demoData, demoCentroids))

# graphClusters void function takes in a 1D and a 2D numpy array to graph. The
# 1D array of centroid locations and classifications have distinct points on the
# graph. The 2D array graphs points of normalized CSV data and colors them the
# same color as their corresponding centroids. A legend is generated in a
# reasonable position.
# Bonus: Create lines roughly separating each centroid group
def graphClusters():
    """Placeholder for the cluster plot; not implemented yet.

    Takes no arguments, does nothing, and returns None.
    """
    return None

# dataAnalysis void function takes in the original parsed CSV classifications
# and the final classifications of the data based on K-means clustering (use of
# centroids) and compares the two to find false/true positives/negatives.
# Note: This should only run when there are two centroids (i.e.: k = 2)
# False positive: Percentage of non-CKD were incorrectly labelled by K-Means as
# being in the CKD cluster
# True positive (sensitivity): Percentage of CKD patients were correctly
# labeled by K-Means
# False negative: Percentage of CKD patients were incorrectly labelled by
# K-Means as being in the non-CKD cluster
# True negative (specificity): Percentage of non-CKD patients were correctly
# labelled by K-Means
# Note: True positive (~93%) + False negative (~7%) = 100%
# Note: True negative (~100%) + False positive (~0%) = 100%
def dataAnalysis():
    """Placeholder for the true/false positive/negative report.

    Not implemented yet: takes no arguments, does nothing, returns None.
    """
    return None
# =============================================================================
# Main Script
# =============================================================================
# mainDriver function takes in nothing and graphs both the original CSV file,
# the k number of nearest neighbors, and the test case. This function returns
# 0.
def mainDriver():
    """Load, normalise and plot the CKD data set.

    Reads 'ckd.csv' through NearestNeighborClassifier, rescales it, draws
    the scatter plot and displays it.

    Returns:
        0 once the plot window has been shown.
    """
    # Open the CSV file using the parsing method from
    # NearestNeighborClassifier. The original line only referenced the
    # function without calling it.
    rawData = NNC.openCSVFile('ckd.csv')

    # Normalize the data; normalizeData requires the parsed data as its
    # argument (the original call passed nothing and raised TypeError).
    normData = NNC.normalizeData(rawData)

    # Graph the normalised data; graphCSVFile likewise needs its argument.
    NNC.graphCSVFile(normData)

    # graphCSVFile only draws onto the current figure, so display it here.
    plt.show()

    return 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# =============================================================================
# KNearestNeighborClassifier.py
# Name: Alycia Wong and Brandon Wong
# Date: June 2020
# Description: Process and graph a CSV file containing biomedical data that
# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD).
# Create a random test case and determine whether the case is
# likely to have CKD depending on the mode of the classifications of the
# k number of nearest points.
# =============================================================================

# =============================================================================
# Import statements
# =============================================================================
import matplotlib.pyplot as plt
import numpy as np
import NearestNeighborClassifier as NNC
from statistics import mode

# =============================================================================
# Functions
# =============================================================================
# findDistanceArray inputs a numpy array, a random point, and an integer k and
# uses the findDistance function from NearestNeighborClassifier. The function
# outputs a 1D array containing the k number of nearest points to the random
# test case.
def findDistanceArray(normArr, testCase, k):
    """Return the indices of the k data points closest to testCase.

    normArr supplies the ``hemo`` and ``gluc`` coordinate arrays; testCase
    is indexed as [1] for hemoglobin and [0] for glucose. The result is
    the first k entries of the distance argsort, nearest first.
    """
    # findDistance is plain NumPy arithmetic, so a single call over the
    # whole columns yields every point's distance at once.
    allDistances = NNC.findDistance(normArr.hemo, normArr.gluc,
                                    testCase[1], testCase[0])
    nearestFirst = np.argsort(allDistances)
    return nearestFirst[:k]

# graphKNearestNeighbor void function takes in two 1D and one 2D numpy arrays
# to graph. One of the 1D arrays is a random testCase with its own distinct
# points. The other 1D array is used to circle the k number of points closest
# to the test case. The 2D array contains information parsed from the CSV
# column. The first column (hemoglobin) is graphed as the x-axis and the second
# column (glucose) as the y-axis. The third column (classification) determines
# the color of the points. A legend is generated in a reasonable position.
def graphKNearestNeighbor(testCase, normArr, k):
    """Plot the data set, the test case and its k nearest neighbours.

    Parameters:
        testCase: point indexed as [0] glucose (y-axis) and [1] hemoglobin
            (x-axis).
        normArr: data object exposing ``hemo``, ``gluc`` and ``disease``
            arrays.
        k: number of neighbours that vote on the test case's class.

    The test case is drawn as an 'x', blue when the most common ``disease``
    value among the k neighbours is 0 and red otherwise. The neighbours are
    over-plotted in yellow. The figure is shown before returning None.
    """
    kindex = findDistanceArray(normArr, testCase, k)
    # Take the vote before drawing anything so a failure in mode() does not
    # leave a half-built figure behind.
    vote = mode(normArr.disease[kindex])
    NNC.graphCSVFile(normArr)
    plt.scatter(testCase[1], testCase[0],
                c=('b' if vote == 0 else 'r'),
                label='Test Case',
                marker="x")
    plt.scatter(normArr.hemo[kindex], normArr.gluc[kindex],
                c='y', label='Nearest neighbor(s)')
    plt.legend(fontsize="small")
    plt.show()
    return

# =============================================================================
# Main Script
# =============================================================================
# mainDriver function takes in nothing and graphs both the original CSV file,
# the k number of nearest neighbors, and the test case. This function returns
# 0.
def mainDriver():
    """Ask for k, then classify and plot one random test case.

    Prompts on standard input for the number of neighbours, creates a
    random test case, loads and normalises 'ckd.csv', and shows the
    k-nearest-neighbour plot.

    Returns:
        0 after the plot has been shown.

    Raises:
        ValueError: if the reply is not an integer or is smaller than 1.
    """
    val = int(input("How many neighbors are you looking for: "))
    if val < 1:
        # k == 0 would reach statistics.mode with no data, and a negative k
        # would slice the neighbour list from the wrong end.
        raise ValueError("Number of neighbors must be at least 1")
    test = NNC.createTestCase()
    normal = NNC.normalizeData(NNC.openCSVFile('ckd.csv'))
    graphKNearestNeighbor(test, normal, val)
    return 0


# Runs the interactive driver whenever this file is executed or imported.
mainDriver()
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# =============================================================================
# NearestNeighborClassifier.py
# Name: Alycia Wong and Brandon Wong
# Date: June 2020
# Description: Process and graph a CSV file containing biomedical data that
# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD).
# Create n number of random test cases and determine whether the case is
# likely to have CKD depending on the classification of the nearest point.
# =============================================================================

# =============================================================================
# Import statements
# =============================================================================
import matplotlib.pyplot as plt
import numpy as np

# =============================================================================
# Classes
# =============================================================================
class Butts:
    """Container exposing the columns and summaries of a 2D data table.

    Attributes set from ``data`` (indexed as data[row, column]):
        gluc, hemo, disease: columns 0, 1 and 2.
        paras: the first two columns; all: the first three columns.
        len: number of rows; shape: np.shape(data).
        colmin, colmax: per-column minimum and maximum.
    """

    def __init__(self, data):
        # Whole-table summaries.
        self.shape = np.shape(data)
        self.len = len(data)
        self.colmin = np.amin(data, axis=0)
        self.colmax = np.amax(data, axis=0)

        # Multi-column slices.
        self.paras = data[:, :2]
        self.all = data[:, :3]

        # Individual columns.
        self.gluc, self.hemo, self.disease = data[:, 0], data[:, 1], data[:, 2]

# =============================================================================
# Functions
# =============================================================================
# Parses in file and turns it into Butts class of data
def openCSVFile(fileName):
    """Parse a comma-separated file into a Butts data object.

    The first line of the file is skipped as a header; the remaining rows
    are read with np.genfromtxt and wrapped in Butts.
    """
    table = np.genfromtxt(fileName, skip_header=1, delimiter=',')
    return Butts(table)

# Takes in butts class
# Loops over data normalizing it for every row
# returns normalized butts class data
def normalizeData(dataArr):
    """Min-max scale every column of a Butts object into the range 0-1.

    Parameters:
        dataArr: Butts object; its ``all`` block (first three columns) is
            rescaled using the matching entries of ``colmin``/``colmax``.

    Returns:
        A new Butts object built from the rescaled values. A column whose
        minimum equals its maximum is mapped to all zeros instead of
        producing NaN through a 0/0 division.
    """
    values = dataArr.all
    # colmin/colmax cover every column of the source table; keep only the
    # entries that line up with the columns held in ``all``.
    width = values.shape[1]
    low = dataArr.colmin[:width]
    span = dataArr.colmax[:width] - low
    # Constant column: the numerator is already 0, so dividing by 1 gives 0.
    safeSpan = np.where(span == 0, 1, span)
    # One broadcast expression replaces the original row-by-row loop.
    return Butts((values - low) / safeSpan)

# graphCSVFile void function takes in a 2D numpy array and graphs with the
# first column (hemoglobin) as the x-axis and second column (glucose) as the
# y-axis. The third column (classification) is used to determine the color of
# the points on the graph.
def graphCSVFile(normArr):
    """Scatter-plot hemoglobin (x) against glucose (y), coloured by class.

    Points whose ``disease`` value is 0 are drawn in blue and labelled
    'No CKD'; points whose value is 1 are drawn in red and labelled 'CKD'.
    The title and axis labels are set; the figure is not shown here.
    """
    classStyles = ((0, 'b', 'No CKD'), (1, 'r', 'CKD'))
    for classValue, colour, legendName in classStyles:
        chosen = normArr.disease == classValue
        plt.scatter(normArr.hemo[chosen], normArr.gluc[chosen],
                    c=colour, label=legendName)
    plt.title('Hemoglobin and Glucose levels')
    plt.xlabel('Hemoglobin')
    plt.ylabel('Glucose')
    return
# findDistance function is either:
# a) takes in an array and a point and returns an array of distances or the
# minimum distance or
# B) takes in cartesian coordinates and uses a simple use of the distance
# formula to return the distance between the two points.
def findDistance(x1, y1, x2, y2):
    """Return the straight-line distance between (x1, y1) and (x2, y2)."""
    deltaX = x1 - x2
    deltaY = y1 - y2
    return np.sqrt(deltaX ** 2 + deltaY ** 2)

# createTestCase function creates two random test cases (hemoglobin and
# glucose) from 0-1 and:
# creates a new 1D array with the two points
# return the points raw
def createTestCase():
    """Return a random test point as a 1D array of two values in [0, 1).

    The values come from NumPy's global random state.
    """
    coordinateCount = 2
    return np.random.random_sample(coordinateCount)

# nearestNeighborIndex takes in the test case point and returns the index of the
# nearest point to the test case
def nearestNeighborIndex(testCase, normArr):
    """Return the index of the data point closest to testCase.

    testCase is indexed as [1] for hemoglobin and [0] for glucose; normArr
    supplies the ``hemo`` and ``gluc`` coordinate arrays.
    """
    # findDistance is plain NumPy arithmetic, so one call over the whole
    # columns gives the distance from the test case to every point.
    allDistances = findDistance(normArr.hemo, normArr.gluc,
                                testCase[1], testCase[0])
    return allDistances.argmin()

# graphNearestNeighbor void function takes in a 2D numpy array (and a cartesian
# coordinate depending on createTestCase) and graphs the first column
# (hemoglobin) as the x-axis and the second column (glucose) as the y-axis
# the third column (classification) determines the color of the points. A
# randomly generated test case is graphed as a distinct point with a
# line connecting it to the nearest neighbor whose classification it takes on.
# A legend is generated in a reasonable position.
def graphNearestNeighbor(testCase, normArr):
    """Plot the data set and the test case joined to its nearest neighbour.

    The test case (indexed as [0] glucose, [1] hemoglobin) is drawn as an
    'x', blue when the nearest neighbour's ``disease`` value is 0 and red
    otherwise. A black line connects it to that neighbour. The figure is
    shown before returning None.
    """
    nearest = nearestNeighborIndex(testCase, normArr)
    graphCSVFile(normArr)

    testGluc, testHemo = testCase[0], testCase[1]
    if normArr.disease[nearest] == 0:
        colour = 'b'
    else:
        colour = 'r'

    plt.scatter(testHemo, testGluc, c=colour, label='Test Case', marker="x")
    plt.plot([testHemo, normArr.hemo[nearest]],
             [testGluc, normArr.gluc[nearest]],
             'k-')
    plt.legend()
    plt.show()
    return

# =============================================================================
# Main Script
# =============================================================================
# mainDriver function takes in no inputs and graphs both the original CSV
# file and the test case. This function returns 0.
def mainDriver():
    """Classify and plot one random test case against 'ckd.csv'.

    Creates a random test case, loads and normalises the CSV data, shows
    the nearest-neighbour plot, and returns 0.
    """
    testCase = createTestCase()
    normData = normalizeData(openCSVFile('ckd.csv'))
    graphNearestNeighbor(testCase, normData)
    return 0
# mainDriver()
Loading