268 lines
8.7 KiB
Python
268 lines
8.7 KiB
Python
import pytest
|
|
|
|
from knn import (
|
|
KMeansBinary,
|
|
Point,
|
|
calculate_centroid,
|
|
classify_point_on_clusters,
|
|
compare_clusters,
|
|
vector_scalar_multiplication,
|
|
vector_sum,
|
|
)
|
|
|
|
|
|
# Test fixtures and helper functions
|
|
@pytest.fixture
def euclidean_distance():
    """Fixture supplying a Euclidean (L2) distance callable.

    The returned function pairs coordinates with ``zip``, so any extra
    coordinates on the longer of the two points are ignored.
    """

    def distance(p1: Point, p2: Point) -> float:
        # Square each per-axis gap, add them up, then take the square root.
        squared_gaps = [(u - v) ** 2 for u, v in zip(p1, p2)]
        return sum(squared_gaps) ** 0.5

    return distance
|
|
|
|
|
|
@pytest.fixture
def manhattan_distance():
    """Fixture supplying a Manhattan (L1) distance callable.

    The returned function pairs coordinates with ``zip``, so any extra
    coordinates on the longer of the two points are ignored.
    """

    def distance(p1: Point, p2: Point) -> float:
        # Running total of absolute per-axis gaps.
        total = 0
        for u, v in zip(p1, p2):
            total += abs(u - v)
        return total

    return distance
|
|
|
|
|
|
def points_close(p1: Point, p2: Point, tolerance: float = 1e-6) -> bool:
    """Check if two points are approximately equal.

    Args:
        p1: First point.
        p2: Second point.
        tolerance: Exclusive upper bound on the absolute difference allowed
            for each coordinate pair.

    Returns:
        True when both points have the same number of coordinates and every
        coordinate pair differs by less than ``tolerance``; False otherwise.
    """
    # ``zip`` stops at the shorter input, so without this guard a comparison
    # against a shorter (or empty) point would succeed without checking
    # anything.
    if len(p1) != len(p2):
        return False
    return all(abs(a - b) < tolerance for a, b in zip(p1, p2))
|
|
|
|
|
|
class TestVectorOperations:
    """Checks for ``vector_scalar_multiplication`` and ``vector_sum``."""

    def test_vector_scalar_multiplication(self):
        """Scale one vector by a positive, a zero and a negative factor."""
        base = [1.0, 2.0, 3.0]

        doubled = vector_scalar_multiplication(2, base)
        assert doubled == [2.0, 4.0, 6.0]

        # A zero factor is expected to collapse every component to zero.
        zeroed = vector_scalar_multiplication(0, base)
        assert zeroed == [0.0, 0.0, 0.0]

        # A negative, fractional factor flips the sign while scaling.
        flipped = vector_scalar_multiplication(-1.5, base)
        assert flipped == [-1.5, -3.0, -4.5]

    def test_vector_sum(self):
        """Add two vectors component-wise, then add the zero vector."""
        left = [1.0, 2.0, 3.0]
        right = [4.0, 5.0, 6.0]
        assert vector_sum(left, right) == [5.0, 7.0, 9.0]

        # Adding the zero vector is expected to give back an equal vector.
        origin = [0.0, 0.0, 0.0]
        assert vector_sum(left, origin) == left

    def test_vector_sum_different_lengths(self):
        """Summing vectors of unequal length is expected to raise."""
        short = [1.0, 2.0]
        long = [3.0, 4.0, 5.0]
        with pytest.raises(AssertionError):
            vector_sum(short, long)
|
|
|
|
|
|
class TestCentroid:
    """Checks for ``calculate_centroid``."""

    def test_calculate_centroid_simple(self):
        """Three 2-D points: the centroid is the per-axis mean."""
        triangle = [[0.0, 0.0], [2.0, 0.0], [1.0, 2.0]]
        mean_point = [1.0, 2.0 / 3.0]
        assert points_close(calculate_centroid(triangle), mean_point)

    def test_calculate_centroid_single_point(self):
        """A single point is expected to be its own centroid."""
        lone = [[3.0, 4.0]]
        assert points_close(calculate_centroid(lone), [3.0, 4.0])

    def test_calculate_centroid_empty_list(self):
        """No points: the centroid is expected to be an empty list."""
        nothing = []
        assert calculate_centroid(nothing) == []

    def test_calculate_centroid_identical_points(self):
        """Copies of one point are expected to average to that point."""
        copies = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]
        assert points_close(calculate_centroid(copies), [1.0, 1.0])
|
|
|
|
|
|
class TestClassification:
    """Checks for ``classify_point_on_clusters`` using Euclidean distance.

    The tests expect a truthy result when the point is closer to the first
    center and a falsy result otherwise.
    """

    def test_classify_point_on_clusters(self, euclidean_distance):
        """Points clearly nearer to one center are attributed to it."""
        centers = ([0.0, 0.0], [3.0, 3.0])

        # Point should be closer to the first center.
        point = [1.0, 1.0]
        assert classify_point_on_clusters(point, centers, euclidean_distance)

        # Point should be closer to the second center.
        point = [2.5, 2.5]
        assert not classify_point_on_clusters(
            point, centers, euclidean_distance
        )

    def test_classify_point_equidistant(self, euclidean_distance):
        """A tie between both centers is expected to give a falsy result."""
        point = [1.5, 1.5]
        centers = ([0.0, 0.0], [3.0, 3.0])

        # Equidistant is not "closer to the first center", so the expected
        # answer is the falsy one.
        assert not classify_point_on_clusters(
            point, centers, euclidean_distance
        )
|
|
|
|
|
|
class TestClusterComparison:
    """Checks for ``compare_clusters``."""

    def test_compare_clusters_identical(self):
        """Clusters with the same points in the same order compare equal."""
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[1.0, 2.0], [3.0, 4.0]]
        assert compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_different_order(self):
        """Point order inside a cluster is expected not to matter."""
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[3.0, 4.0], [1.0, 2.0]]
        assert compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_different(self):
        """Clusters that differ in one point are expected to compare unequal."""
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[1.0, 2.0], [5.0, 6.0]]
        assert not compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_empty(self):
        """Two empty clusters match; an empty and a non-empty one do not."""
        assert compare_clusters([], [])
        assert not compare_clusters([[1.0, 2.0]], [])
|
|
|
|
|
|
class TestKMeansBinary:
    """Checks for ``KMeansBinary``, which returns a pair of centroids.

    Where the order of the two centroids is not what is being tested, the
    assertions are written so that either order is accepted.
    """

    def test_kmeans_binary_simple_case(self, euclidean_distance):
        """Two well-separated 2-D groups produce one centroid per group."""
        points = [
            [0.0, 0.0],
            [0.1, 0.1],
            [0.2, 0.0],  # Cluster 1
            [5.0, 5.0],
            [5.1, 5.1],
            [5.0, 5.2],  # Cluster 2
        ]

        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # Check that centroids have the dimensionality of the input.
        assert len(centroid1) == 2
        assert len(centroid2) == 2

        # One centroid should be near (0.1, 0.033), other near (5.033, 5.1)
        c1_near_origin = abs(centroid1[0]) < 1 and abs(centroid1[1]) < 1
        c2_near_origin = abs(centroid2[0]) < 1 and abs(centroid2[1]) < 1

        # Exactly one should be near origin
        assert c1_near_origin != c2_near_origin

    def test_kmeans_binary_single_point(self, euclidean_distance):
        """A lone input point is a centroid; the other side may be empty."""
        points = [[1.0, 1.0]]
        expected = [1.0, 1.0]

        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)
        centroids = [centroid1, centroid2]

        # Length checks keep an empty centroid from being mistaken for a
        # match on the single point.
        def is_the_point(c):
            return len(c) == 2 and points_close(c, expected)

        # At least one centroid must be the single point itself ...
        assert any(is_the_point(c) for c in centroids)
        # ... and the other is either that same point or an empty cluster.
        assert all(len(c) == 0 or is_the_point(c) for c in centroids)

    def test_kmeans_binary_two_points(self, euclidean_distance):
        """Two input points are expected to end up as the two centroids."""
        points = [[0.0, 0.0], [2.0, 2.0]]
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # Should converge to the two original points, in either order.
        centroids = [centroid1, centroid2]
        assert any(points_close(c, [0.0, 0.0]) for c in centroids)
        assert any(points_close(c, [2.0, 2.0]) for c in centroids)

    def test_kmeans_binary_convergence(self, euclidean_distance):
        """With ``max_iterations=50`` two diagonal groups stay far apart."""
        # Five points along the diagonal near the origin, plus the same
        # five shifted by +5 on both axes.
        points = [[i / 10.0, i / 10.0] for i in range(5)] + [
            [i / 10.0 + 5, i / 10.0 + 5] for i in range(5)
        ]

        centroid1, centroid2 = KMeansBinary(
            points, euclidean_distance, max_iterations=50
        )

        # Should produce two distinct, well-separated clusters.
        distance_between_centroids = euclidean_distance(centroid1, centroid2)
        assert distance_between_centroids > 2.0

    def test_kmeans_binary_with_manhattan_distance(self, manhattan_distance):
        """Clustering also separates the groups under Manhattan distance."""
        points = [[0, 0], [1, 0], [0, 1], [10, 10], [11, 10], [10, 11]]

        centroid1, centroid2 = KMeansBinary(points, manhattan_distance)

        # Should separate into two 2-D clusters.
        assert len(centroid1) == 2
        assert len(centroid2) == 2

        # One centroid should be near origin, other near (10, 10).
        c1_near_origin = abs(centroid1[0]) < 5 and abs(centroid1[1]) < 5
        c2_near_origin = abs(centroid2[0]) < 5 and abs(centroid2[1]) < 5

        assert c1_near_origin != c2_near_origin

    def test_kmeans_binary_max_iterations(self, euclidean_distance):
        """A single allowed iteration still yields two 2-D centroids."""
        points = [[i, i] for i in range(10)]

        # Should work with very few iterations.
        centroid1, centroid2 = KMeansBinary(
            points, euclidean_distance, max_iterations=1
        )
        assert len(centroid1) == 2
        assert len(centroid2) == 2

    def test_kmeans_binary_3d_points(self, euclidean_distance):
        """Two separated 3-D groups produce one 3-D centroid per group."""
        points = [
            [0, 0, 0],
            [1, 0, 0],
            [0, 1, 0],  # Near origin
            [5, 5, 5],
            [6, 5, 5],
            [5, 6, 5],  # Far from origin
        ]

        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        assert len(centroid1) == 3
        assert len(centroid2) == 3

        # Should separate the two groups: exactly one centroid has every
        # coordinate within 3 of zero.
        c1_near_origin = all(abs(x) < 3 for x in centroid1)
        c2_near_origin = all(abs(x) < 3 for x in centroid2)

        assert c1_near_origin != c2_near_origin
|
|
|
|
|
|
class TestEdgeCases:
    """Degenerate inputs for ``KMeansBinary``."""

    def test_identical_points(self, euclidean_distance):
        """Five copies of one point: both centroids are compared to it."""
        # List repetition yields five references to the same inner list.
        repeated = [[1.0, 1.0]] * 5
        centroid1, centroid2 = KMeansBinary(repeated, euclidean_distance)

        # Both centroids should be the same point.
        for centroid in (centroid1, centroid2):
            assert points_close(centroid, [1.0, 1.0])

    def test_collinear_points(self, euclidean_distance):
        """Points on the x-axis: both centroids are expected to have y == 0."""
        on_axis = [[x, 0] for x in range(6)]
        centroid1, centroid2 = KMeansBinary(on_axis, euclidean_distance)

        # Should still produce two 2-D clusters.
        assert len(centroid1) == 2
        assert len(centroid2) == 2

        # Every input has y == 0, so each centroid's y should be 0 as well.
        assert centroid1[1] == 0
        assert centroid2[1] == 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Hand pytest's exit code to the interpreter so that running this file
    # directly reports failures through a non-zero process status.
    raise SystemExit(pytest.main([__file__]))
|