LeashQuiche · LeashQuiche · Sep 5, 2020 · Sep 6, 2020
diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KMeansClustering.py
@@ -0,0 +1,130 @@
+# =============================================================================
+# KMeansClustering.py
+# Name: Alycia Wong and Brandon Wong
+# Date: June 2020
+# Description: Process and graph a CSV file containing biomedical data that 
+# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD).
+# Randomly generate up to 10 centroids without issue. Each centroid will have a
+# classification. The nearest centroid to a point will determine the point's 
+# classification (decide what to do if the distances are equal yourself).
+# Create random test cases until centroids stop moving and determine whether
+# each case is likely to have CKD depending on the classification of the
+# nearest centroid.
+# Bonus: Create lines roughly separating each centroid group
+# =============================================================================
+
+# =============================================================================
+# Import statements
+# =============================================================================
+import matplotlib.pyplot as plt
+import numpy as np
+import NearestNeighborClassifier as NNC
+from scipy.spatial import KDTree as kdt
+
+# =============================================================================
+# Functions
+# =============================================================================
+# randomCentroids function takes in an integer k, the number of clusters to
+# be generated.
+# Outputs a 2D array with k rows and 2 columns filled with random values
+# drawn uniformly from 0-1 (1 excluded). The first column represents glucose
+# and the second column represents hemoglobin. Each row is one centroid, and
+# the row index serves as the classification of that centroid (i.e.: row
+# index = classification value).
+def randomCentroids(k):
+    shape = (k, 2)
+    return np.random.random_sample(shape)
+
+# assignToCentroids function takes in an (n, 2) array of normalized points
+# (e.g. the .paras of the normalized CSV data) and the 2D array of centroids.
+# A KD-tree built on the centroids finds the nearest centroid to every point;
+# the 1D array of those centroid row indices (classifications) is returned.
+def assignToCentroids(normArr, centArr):
+    tree = kdt(centArr)
+    _, nearest = tree.query(normArr)
+    return nearest
+# print(assignToCentroids(NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, np.array([[.5, .5],[.25,.25]])))
+
+# updateCentroids function inputs the 2D array of centroid locations, the 1D
+# array of classifications (nearest centroid index per point) and the
+# normalized CSV data (object with .gluc and .hemo arrays). Each centroid moves
+# to the mean glucose (column 0) and mean hemoglobin (column 1) of its points;
+# one with no points stays put (an empty mean is NaN). Returns a new float array.
+def updateCentroids(centArr, classArr, normArr):
+    upCentArr = np.array(centArr, dtype=float)
+    for i in range(len(upCentArr)):
+        inCluster = np.asarray(classArr) == i
+        if inCluster.any():
+            upCentArr[i, 0] = np.mean(normArr.gluc[inCluster])
+            upCentArr[i, 1] = np.mean(normArr.hemo[inCluster])
+    return upCentArr
+# centArr = np.array([[0.5, 0.5], [.25, .25]])
+# print(updateCentroids(
+#     centArr, assignToCentroids(
+#         NNC.normalizeData(NNC.openCSVFile('ckd.csv')).paras, centArr),
+#         NNC.normalizeData(NNC.openCSVFile('ckd.csv'))
+#     ))
+# print(centArr)
+
+# iterate function inputs the normalized CSV data (object with .paras, .gluc
+# and .hemo) and the 2D array of starting centroids. Points are reassigned
+# and centroids updated in a loop until no centroid moves (NaN coordinates
+# count as unmoved so an empty cluster cannot keep it running). Returns them.
+def iterate(normArr, centArr):
+    while True:
+        # The KD-tree query needs the (n, 2) point array, not the data object.
+        classArr = assignToCentroids(normArr.paras, centArr)
+        upCentArr = updateCentroids(centArr, classArr, normArr)
+        if np.array_equal(upCentArr, centArr, equal_nan=True):
+            return centArr
+        centArr = upCentArr
+if __name__ == "__main__":  # demo run only as a script, not on import
+    print(iterate(NNC.normalizeData(NNC.openCSVFile('ckd.csv')),
+                  np.array([[.5, .5], [.25, .25]])))
+
+# graphClusters void function is a placeholder: it takes in nothing, draws
+# nothing and returns None.
+# Planned behavior: graph the centroid locations as distinct points and the
+# normalized CSV data colored to match each point's centroid, with a legend
+# in a reasonable position.
+# Bonus: Create lines roughly separating each centroid group
+def graphClusters():
+    # TODO: not implemented yet.
+    return None
+
+# dataAnalysis void function is a placeholder: it takes in nothing and returns
+# None. Planned: compare the original parsed CSV classifications with the
+# final K-means classifications (use of centroids) to find false/true
+# positives/negatives.
+# Note: This should only run when there are two centroids (i.e.: k = 2)
+# True positive (sensitivity): Percentage of CKD patients correctly labelled
+# by K-Means as being in the CKD cluster
+# False negative: Percentage of CKD patients incorrectly labelled by K-Means
+# as being in the non-CKD cluster
+# True negative (specificity): Percentage of non-CKD patients correctly
+# labelled by K-Means as being in the non-CKD cluster
+# False positive: Percentage of non-CKD patients incorrectly labelled by
+# K-Means as being in the CKD cluster
+# Note: TP (~93%) + FN (~7%) = 100% and TN (~100%) + FP (~0%) = 100%
+def dataAnalysis():
+    return None
+# =============================================================================
+# Main Script
+# =============================================================================
+# mainDriver function takes in nothing, loads and normalizes 'ckd.csv' with
+# the helpers from NearestNeighborClassifier and graphs the result. This
+# function returns 0.
+def mainDriver():
+    # Open the CSV file using the parsing method from
+    # NearestNeighborClassifier. Input is the file name, output is the data.
+    dataArr = NNC.openCSVFile('ckd.csv')
+
+    # Normalize data using method from NearestNeighborClassifier. Input and
+    # output are both data objects.
+    normArr = NNC.normalizeData(dataArr)
+
+    # Graph CSV file using method from NearestNeighborClassifier. Input is
+    # the normalized data. Void function.
+    NNC.graphCSVFile(normArr)
+
+    return 0
diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/KNearestNeighborClassifier.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/KNearestNeighborClassifier.py
@@ -0,0 +1,67 @@
+# =============================================================================
+# KNearestNeighborClassifier.py
+# Name: Alycia Wong and Brandon Wong
+# Date: June 2020
+# Description: Process and graph a CSV file containing biomedical data that 
+# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD).
+# Create a random test case and determine whether the case is
+# likely to have CKD depending on the mode of the classifications of the
+# k number of nearest points.
+# =============================================================================
+
+# =============================================================================
+# Import statements
+# =============================================================================
+import matplotlib.pyplot as plt
+import numpy as np
+import NearestNeighborClassifier as NNC
+from statistics import mode
+
+# =============================================================================
+# Functions
+# =============================================================================
+# findDistanceArray inputs the normalized data (object with .hemo and .gluc
+# arrays), a test case indexed as [glucose, hemoglobin], and an integer k.
+# It uses the findDistance function from NearestNeighborClassifier on the
+# whole arrays at once. The function outputs a 1D array containing the
+# indices of the k points nearest to the test case, closest first (fewer
+# than k indices when the data holds fewer than k points).
+def findDistanceArray(normArr, testCase, k):
+    distArr = NNC.findDistance(normArr.hemo, normArr.gluc, testCase[1], testCase[0])
+    # Sort once, after every distance is known (it used to re-sort per row).
+    return np.argsort(distArr)[:k]
+
+# graphKNearestNeighbor void function takes in a test case indexed as
+# [glucose, hemoglobin], the normalized data (object with .hemo, .gluc and
+# .disease arrays) and an integer k. The data is graphed through
+# NNC.graphCSVFile with hemoglobin on the x-axis and glucose on the y-axis.
+# The test case is drawn as an 'x' colored by the most common classification
+# among its k nearest points (blue for 0, red otherwise), and those k points
+# are overdrawn in yellow. A small legend is added and the figure is shown.
+def graphKNearestNeighbor(testCase, normArr, k):
+    kindex = findDistanceArray(normArr, testCase, k)
+    NNC.graphCSVFile(normArr)
+    vote = mode(normArr.disease[kindex])
+    plt.scatter(testCase[1], testCase[0],
+                c=('b' if vote == 0 else 'r'),
+                label='Test Case',
+                marker="x")
+    plt.scatter(normArr.hemo[kindex], normArr.gluc[kindex],
+                c='y', label='Nearest neighbor(s)')
+    plt.legend(fontsize="small")
+    plt.show()
+    return
+
+# =============================================================================
+# Main Script
+# =============================================================================
+# mainDriver takes in nothing: asks for k, then graphs the normalized CSV
+# data, a random test case and its k nearest neighbors. Returns 0.
+def mainDriver():
+    val = int(input("How many neighbors are you looking for: "))
+    test = NNC.createTestCase()
+    normal = NNC.normalizeData(NNC.openCSVFile('ckd.csv'))
+    graphKNearestNeighbor(test, normal, val)
+    return 0
+if __name__ == "__main__":  # do not prompt for input on import
+    mainDriver()
diff --git a/Summer-2020-Data-Analysis-Project-Brandon-Branch/NearestNeighborClassifier.py b/Summer-2020-Data-Analysis-Project-Brandon-Branch/NearestNeighborClassifier.py
@@ -0,0 +1,112 @@
+# =============================================================================
+# NearestNeighborClassifier.py
+# Name: Alycia Wong and Brandon Wong
+# Date: June 2020
+# Description: Process and graph a CSV file containing biomedical data that 
+# relates hemoglobin levels, glucose levels, and chronic kidney disease (CKD).
+# Create n number of random test cases and determine whether the case is
+# likely to have CKD depending on the classification of the nearest point.
+# =============================================================================
+
+# =============================================================================
+# Import statements
+# =============================================================================
+import matplotlib.pyplot as plt
+import numpy as np
+
+# =============================================================================
+# Classes
+# =============================================================================
+class Butts:
+    """Column-wise view of a 2D array laid out as [glucose, hemoglobin, CKD]."""
+    def __init__(self, data):
+        self.len, self.shape = len(data), np.shape(data)
+        self.all = data[:, :3]  # first three columns together
+        self.paras = data[:, :2]  # the two features: glucose, hemoglobin
+        self.gluc = data[:, 0]
+        self.hemo = data[:, 1]
+        self.disease = data[:, 2]
+        self.colmin = data.min(axis=0)  # per-column minimum
+        self.colmax = data.max(axis=0)  # per-column maximum
+
+# =============================================================================
+# Functions
+# =============================================================================
+# Reads the CSV at fileName, skipping its first (header) row, into a Butts.
+def openCSVFile(fileName):
+    return Butts(np.genfromtxt(fileName, skip_header=1, delimiter=','))
+
+# normalizeData takes in a Butts object and min-max scales each of its
+# columns onto 0-1 as (value - column min) / (column max - column min),
+# working on the whole array at once. A column whose values are all equal
+# has no range to scale by, so it maps to 0 rather than dividing by zero.
+# Returns the scaled data as a new Butts object.
+def normalizeData(dataArr):
+    span = dataArr.colmax - dataArr.colmin
+    return Butts((dataArr.all - dataArr.colmin) / np.where(span == 0, 1, span))
+
+# graphCSVFile void function takes in the normalized data (object with .hemo,
+# .gluc and .disease arrays) and scatters hemoglobin on the x-axis against
+# glucose on the y-axis: classification 0 in blue ('No CKD'), classification
+# 1 in red ('CKD'). Title and axis labels are set; the figure is not shown.
+def graphCSVFile(normArr):
+    for flag, color, name in ((0, 'b', 'No CKD'), (1, 'r', 'CKD')):
+        chosen = normArr.disease == flag
+        plt.scatter(normArr.hemo[chosen], normArr.gluc[chosen],
+                    c=color, label=name)
+    plt.title('Hemoglobin and Glucose levels')
+    plt.xlabel('Hemoglobin')
+    plt.ylabel('Glucose')
+    return
+# findDistance function takes in the cartesian coordinates of two points,
+# (x1, y1) and (x2, y2), and uses the distance formula to return the distance
+# between them. Scalars or numpy arrays may be passed; arrays are handled
+# element by element.
+def findDistance(x1, y1, x2, y2):
+    dx, dy = x1 - x2, y1 - y2
+    return np.sqrt(dx**2 + dy**2)
+
+# createTestCase function takes in nothing and returns a 1D numpy array of
+# two random values drawn uniformly from 0-1 (1 excluded). Callers in this
+# file read index 0 as glucose and index 1 as hemoglobin.
+def createTestCase():
+    count = 2
+    return np.random.random_sample(count)
+
+# nearestNeighborIndex takes in a test case indexed as [glucose, hemoglobin]
+# and the normalized data (object with .hemo and .gluc arrays) and returns
+# the index of the data point nearest to the test case. findDistance is
+# applied to the whole arrays at once instead of one row at a time; the
+# first index wins when several points are equally near.
+def nearestNeighborIndex(testCase, normArr):
+    distArr = findDistance(normArr.hemo, normArr.gluc, testCase[1], testCase[0])
+    return distArr.argmin()
+
+# graphNearestNeighbor void function takes in a test case indexed as
+# [glucose, hemoglobin] and the normalized data (object with .hemo, .gluc and
+# .disease arrays). The data is graphed by graphCSVFile with hemoglobin on
+# the x-axis and glucose on the y-axis. The test case is drawn as an 'x' that
+# takes the color of its nearest neighbor's classification (blue for 0, red
+# otherwise), joined to that neighbor by a black line. A legend is added and
+# the figure is shown.
+def graphNearestNeighbor(testCase, normArr):
+    closest = nearestNeighborIndex(testCase, normArr)
+    graphCSVFile(normArr)
+    hemoglobin, glucose = testCase[1], testCase[0]
+    color = 'r' if normArr.disease[closest] != 0 else 'b'
+    plt.scatter(hemoglobin, glucose, c=color, label='Test Case', marker="x")
+    plt.plot([hemoglobin, normArr.hemo[closest]],
+             [glucose, normArr.gluc[closest]], 'k-')
+    plt.legend()
+    plt.show()
+    return
+
+# =============================================================================
+# Main Script
+# =============================================================================
+# mainDriver takes in nothing, graphs the CSV file and a test case, returns 0.
+def mainDriver():
+    testCase = createTestCase()
+    graphNearestNeighbor(testCase, normalizeData(openCSVFile('ckd.csv')))
+    return 0
+# mainDriver()