"""Unit tests for the vector, centroid, classification and binary k-means
helpers imported from the ``knn`` module."""

import pytest

from knn import (
    KMeansBinary,
    Point,
    calculate_centroid,
    classify_point_on_clusters,
    compare_clusters,
    vector_scalar_multiplication,
    vector_sum,
)


# Test fixtures and helper functions
@pytest.fixture
def euclidean_distance():
    """Provide a Euclidean (L2) distance function taking two points."""

    def distance(p1: Point, p2: Point) -> float:
        return (sum((a - b) ** 2 for a, b in zip(p1, p2))) ** 0.5

    return distance


@pytest.fixture
def manhattan_distance():
    """Provide a Manhattan (L1) distance function taking two points."""

    def distance(p1: Point, p2: Point) -> float:
        return sum(abs(a - b) for a, b in zip(p1, p2))

    return distance


def points_close(p1: Point, p2: Point, tolerance: float = 1e-6) -> bool:
    """Check if two points are approximately equal.

    Two points are close when they have the same number of coordinates and
    every pair of coordinates differs by less than ``tolerance``.

    The explicit length check matters: ``zip`` stops at the shorter input, so
    without it a comparison against an empty point would be vacuously true
    and assertions built on this helper could never fail.
    """
    if len(p1) != len(p2):
        return False
    return all(abs(a - b) < tolerance for a, b in zip(p1, p2))


class TestVectorOperations:
    """Tests for ``vector_scalar_multiplication`` and ``vector_sum``."""

    def test_vector_scalar_multiplication(self):
        vector = [1.0, 2.0, 3.0]
        result = vector_scalar_multiplication(2, vector)
        assert result == [2.0, 4.0, 6.0]

        # Test with zero scalar
        result = vector_scalar_multiplication(0, vector)
        assert result == [0.0, 0.0, 0.0]

        # Test with negative scalar
        result = vector_scalar_multiplication(-1.5, vector)
        assert result == [-1.5, -3.0, -4.5]

    def test_vector_sum(self):
        a = [1.0, 2.0, 3.0]
        b = [4.0, 5.0, 6.0]
        result = vector_sum(a, b)
        assert result == [5.0, 7.0, 9.0]

        # Test with zeros
        zero = [0.0, 0.0, 0.0]
        result = vector_sum(a, zero)
        assert result == a

    def test_vector_sum_different_lengths(self):
        a = [1.0, 2.0]
        b = [3.0, 4.0, 5.0]
        with pytest.raises(AssertionError):
            vector_sum(a, b)


class TestCentroid:
    """Tests for ``calculate_centroid``."""

    def test_calculate_centroid_simple(self):
        points = [[0.0, 0.0], [2.0, 0.0], [1.0, 2.0]]
        centroid = calculate_centroid(points)
        expected = [1.0, 2.0 / 3.0]
        assert points_close(centroid, expected)

    def test_calculate_centroid_single_point(self):
        points = [[3.0, 4.0]]
        centroid = calculate_centroid(points)
        assert points_close(centroid, [3.0, 4.0])

    def test_calculate_centroid_empty_list(self):
        points = []
        centroid = calculate_centroid(points)
        assert centroid == []

    def test_calculate_centroid_identical_points(self):
        points = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]
        centroid = calculate_centroid(points)
        assert points_close(centroid, [1.0, 1.0])


class TestClassification:
    """Tests for ``classify_point_on_clusters``."""

    def test_classify_point_on_clusters(self, euclidean_distance):
        point = [1.0, 1.0]
        centers = ([0.0, 0.0], [3.0, 3.0])

        # Point should be closer to first center
        result = classify_point_on_clusters(point, centers, euclidean_distance)
        assert result

        # Test point closer to second center
        point = [2.5, 2.5]
        result = classify_point_on_clusters(point, centers, euclidean_distance)
        assert not result

    def test_classify_point_equidistant(self, euclidean_distance):
        point = [1.5, 1.5]
        centers = ([0.0, 0.0], [3.0, 3.0])

        # Point is equidistant, should return False (>=)
        result = classify_point_on_clusters(point, centers, euclidean_distance)
        assert not result


class TestClusterComparison:
    """Tests for ``compare_clusters``."""

    def test_compare_clusters_identical(self):
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[1.0, 2.0], [3.0, 4.0]]
        assert compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_different_order(self):
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[3.0, 4.0], [1.0, 2.0]]
        assert compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_different(self):
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[1.0, 2.0], [5.0, 6.0]]
        assert not compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_empty(self):
        assert compare_clusters([], [])
        assert not compare_clusters([[1.0, 2.0]], [])


class TestKMeansBinary:
    """Tests for ``KMeansBinary`` on small, hand-built point sets."""

    def test_kmeans_binary_simple_case(self, euclidean_distance):
        # Two well-separated clusters
        points = [
            [0.0, 0.0], [0.1, 0.1], [0.2, 0.0],  # Cluster 1
            [5.0, 5.0], [5.1, 5.1], [5.0, 5.2],  # Cluster 2
        ]

        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # Check that centroids are reasonable
        assert len(centroid1) == 2
        assert len(centroid2) == 2

        # One centroid should be near (0.1, 0.033), other near (5.033, 5.1)
        c1_near_origin = abs(centroid1[0]) < 1 and abs(centroid1[1]) < 1
        c2_near_origin = abs(centroid2[0]) < 1 and abs(centroid2[1]) < 1

        # Exactly one should be near origin
        assert c1_near_origin != c2_near_origin

    def test_kmeans_binary_single_point(self, euclidean_distance):
        points = [[1.0, 1.0]]
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # The first centroid should be the single point; the second cluster
        # is expected to be empty, so its centroid is the empty point.
        assert points_close(centroid1, [1.0, 1.0])
        assert points_close(centroid2, [])  # Empty cluster

    def test_kmeans_binary_two_points(self, euclidean_distance):
        points = [[0.0, 0.0], [2.0, 2.0]]
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # Should converge to the two original points
        centroids = [centroid1, centroid2]
        assert any(points_close(c, [0.0, 0.0]) for c in centroids)
        assert any(points_close(c, [2.0, 2.0]) for c in centroids)

    def test_kmeans_binary_convergence(self, euclidean_distance):
        # Test that algorithm converges within reasonable iterations
        points = [
            [i / 10.0, i / 10.0] for i in range(5)  # Points along diagonal
        ] + [
            [i / 10.0 + 5, i / 10.0 + 5] for i in range(5)  # Shifted cluster
        ]

        centroid1, centroid2 = KMeansBinary(
            points, euclidean_distance, max_iterations=50
        )

        # Should produce two distinct clusters
        distance_between_centroids = euclidean_distance(centroid1, centroid2)
        assert distance_between_centroids > 2.0  # Should be well separated

    def test_kmeans_binary_with_manhattan_distance(self, manhattan_distance):
        points = [[0, 0], [1, 0], [0, 1], [10, 10], [11, 10], [10, 11]]

        centroid1, centroid2 = KMeansBinary(points, manhattan_distance)

        # Should separate into two clusters
        assert len(centroid1) == 2
        assert len(centroid2) == 2

        # One centroid should be near origin, other near (10,10)
        c1_near_origin = abs(centroid1[0]) < 5 and abs(centroid1[1]) < 5
        c2_near_origin = abs(centroid2[0]) < 5 and abs(centroid2[1]) < 5
        assert c1_near_origin != c2_near_origin

    def test_kmeans_binary_max_iterations(self, euclidean_distance):
        # Test that max_iterations parameter works
        points = [[i, i] for i in range(10)]

        # Should work with very few iterations
        centroid1, centroid2 = KMeansBinary(
            points, euclidean_distance, max_iterations=1
        )

        assert len(centroid1) == 2
        assert len(centroid2) == 2

    def test_kmeans_binary_3d_points(self, euclidean_distance):
        # Test with 3D points
        points = [
            [0, 0, 0], [1, 0, 0], [0, 1, 0],  # Near origin
            [5, 5, 5], [6, 5, 5], [5, 6, 5],  # Far from origin
        ]

        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        assert len(centroid1) == 3
        assert len(centroid2) == 3

        # Should separate the two groups
        c1_near_origin = all(abs(x) < 3 for x in centroid1)
        c2_near_origin = all(abs(x) < 3 for x in centroid2)
        assert c1_near_origin != c2_near_origin


class TestEdgeCases:
    """Degenerate inputs for ``KMeansBinary``."""

    def test_identical_points(self, euclidean_distance):
        # All points are identical
        points = [[1.0, 1.0]] * 5
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # Both centroids should be the same point
        # NOTE(review): points_close rejects points of different length, so
        # these assertions also require that neither cluster ends up empty.
        # Confirm KMeansBinary guarantees that for duplicate points.
        assert points_close(centroid1, [1.0, 1.0])
        assert points_close(centroid2, [1.0, 1.0])

    def test_collinear_points(self, euclidean_distance):
        # Points on a line
        points = [[i, 0] for i in range(6)]
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # Should still produce two clusters
        assert len(centroid1) == 2
        assert len(centroid2) == 2
        assert centroid1[1] == 0  # y-coordinate should be 0
        assert centroid2[1] == 0  # y-coordinate should be 0


if __name__ == "__main__":
    # Propagate pytest's exit code so running this file as a script reports
    # test failures to the calling shell instead of always exiting with 0.
    raise SystemExit(pytest.main([__file__]))