1
0
# python-vector-database/src/tests/test_kmeans.py
#
# 268 lines, 8.7 KiB, Python

import pytest
from knn import (
KMeansBinary,
Point,
calculate_centroid,
classify_point_on_clusters,
compare_clusters,
vector_scalar_multiplication,
vector_sum,
)
# Test fixtures and helper functions
@pytest.fixture
def euclidean_distance():
    """Provide a Euclidean (L2) distance callable to the tests.

    The returned function pairs coordinates with ``zip``, so when the two
    points differ in length the surplus coordinates of the longer one are
    ignored.
    """

    def _euclidean(p1: Point, p2: Point) -> float:
        # Accumulate the squared per-coordinate differences, then take the
        # square root of the total.
        squared_total = 0
        for a, b in zip(p1, p2):
            squared_total += (a - b) ** 2
        return squared_total ** 0.5

    return _euclidean
@pytest.fixture
def manhattan_distance():
    """Provide a Manhattan (L1) distance callable to the tests.

    The returned function pairs coordinates with ``zip``, so when the two
    points differ in length the surplus coordinates of the longer one are
    ignored.
    """

    def _manhattan(p1: Point, p2: Point) -> float:
        # Sum of absolute per-coordinate differences.
        total = 0
        for a, b in zip(p1, p2):
            total += abs(a - b)
        return total

    return _manhattan
def points_close(p1: Point, p2: Point, tolerance: float = 1e-6) -> bool:
    """Check if two points are approximately equal.

    Args:
        p1: First point (a sized sequence of numbers).
        p2: Second point (a sized sequence of numbers).
        tolerance: Exclusive upper bound on the absolute difference allowed
            for each coordinate.

    Returns:
        True when both points have the same number of coordinates and every
        coordinate pair differs by less than ``tolerance``; False otherwise.
    """
    # zip() stops at the shorter input, so without this guard a comparison
    # against a shorter (or empty) point would succeed vacuously.
    if len(p1) != len(p2):
        return False
    return all(abs(a - b) < tolerance for a, b in zip(p1, p2))
class TestVectorOperations:
    """Tests for the element-wise vector helpers imported from ``knn``."""

    def test_vector_scalar_multiplication(self):
        """Scale one vector by a positive, a zero and a negative scalar."""
        base = [1.0, 2.0, 3.0]
        # (scalar, expected product) pairs, checked in the listed order.
        cases = [
            (2, [2.0, 4.0, 6.0]),
            (0, [0.0, 0.0, 0.0]),  # zero scalar
            (-1.5, [-1.5, -3.0, -4.5]),  # negative scalar
        ]
        for scalar, expected in cases:
            assert vector_scalar_multiplication(scalar, base) == expected

    def test_vector_sum(self):
        """Add two equal-length vectors, then add the zero vector."""
        left = [1.0, 2.0, 3.0]
        right = [4.0, 5.0, 6.0]
        assert vector_sum(left, right) == [5.0, 7.0, 9.0]

        # Adding the zero vector must give back the other operand's values.
        zeros = [0.0, 0.0, 0.0]
        assert vector_sum(left, zeros) == left

    def test_vector_sum_different_lengths(self):
        """Summing vectors of different lengths must raise AssertionError."""
        shorter = [1.0, 2.0]
        longer = [3.0, 4.0, 5.0]
        with pytest.raises(AssertionError):
            vector_sum(shorter, longer)
class TestCentroid:
    """Tests for ``calculate_centroid``.

    ``points_close`` pairs coordinates with ``zip``, so used alone it could
    accept a centroid that has too few coordinates. Each approximate
    comparison below therefore pins the centroid's dimension first.
    """

    def test_calculate_centroid_simple(self):
        """Three 2-D points: the centroid is their coordinate-wise mean."""
        points = [[0.0, 0.0], [2.0, 0.0], [1.0, 2.0]]
        centroid = calculate_centroid(points)
        expected = [1.0, 2.0 / 3.0]
        assert len(centroid) == len(expected)
        assert points_close(centroid, expected)

    def test_calculate_centroid_single_point(self):
        """A single point is its own centroid."""
        points = [[3.0, 4.0]]
        centroid = calculate_centroid(points)
        expected = [3.0, 4.0]
        assert len(centroid) == len(expected)
        assert points_close(centroid, expected)

    def test_calculate_centroid_empty_list(self):
        """No input points yields an empty centroid."""
        points = []
        centroid = calculate_centroid(points)
        assert centroid == []

    def test_calculate_centroid_identical_points(self):
        """Identical points share the centroid with every one of them."""
        points = [[1.0, 1.0], [1.0, 1.0], [1.0, 1.0]]
        centroid = calculate_centroid(points)
        expected = [1.0, 1.0]
        assert len(centroid) == len(expected)
        assert points_close(centroid, expected)
class TestClassification:
    """Tests for ``classify_point_on_clusters``.

    The assertions below treat a truthy result as "closer to the first
    center" and a falsy result as "closer to the second center".
    """

    def test_classify_point_on_clusters(self, euclidean_distance):
        """Points nearer the first / second center classify truthy / falsy."""
        centers = ([0.0, 0.0], [3.0, 3.0])

        # Point should be closer to first center
        point = [1.0, 1.0]
        result = classify_point_on_clusters(point, centers, euclidean_distance)
        assert result

        # Test point closer to second center
        point = [2.5, 2.5]
        result = classify_point_on_clusters(point, centers, euclidean_distance)
        assert not result

    def test_classify_point_equidistant(self, euclidean_distance):
        """A point equally far from both centers classifies as falsy."""
        point = [1.5, 1.5]
        centers = ([0.0, 0.0], [3.0, 3.0])
        # Point is equidistant, should return False (>=)
        result = classify_point_on_clusters(point, centers, euclidean_distance)
        assert not result
class TestClusterComparison:
    """Tests for ``compare_clusters``."""

    def test_compare_clusters_identical(self):
        """Clusters with the same points in the same order compare equal."""
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[1.0, 2.0], [3.0, 4.0]]
        assert compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_different_order(self):
        """Point order within a cluster must not affect the comparison."""
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[3.0, 4.0], [1.0, 2.0]]
        assert compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_different(self):
        """Clusters that differ in one point compare unequal."""
        cluster_a = [[1.0, 2.0], [3.0, 4.0]]
        cluster_b = [[1.0, 2.0], [5.0, 6.0]]
        assert not compare_clusters(cluster_a, cluster_b)

    def test_compare_clusters_empty(self):
        """Two empty clusters are equal; empty versus non-empty is not."""
        assert compare_clusters([], [])
        assert not compare_clusters([[1.0, 2.0]], [])
class TestKMeansBinary:
    """End-to-end tests for ``KMeansBinary``.

    Every test unpacks the result into two centroids. Which centroid
    belongs to which group of input points is never assumed.
    """

    def test_kmeans_binary_simple_case(self, euclidean_distance):
        """Two well-separated 2-D groups: exactly one centroid near origin."""
        # Two well-separated clusters
        points = [
            [0.0, 0.0],
            [0.1, 0.1],
            [0.2, 0.0],  # Cluster 1
            [5.0, 5.0],
            [5.1, 5.1],
            [5.0, 5.2],  # Cluster 2
        ]
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # Check that centroids are reasonable
        assert len(centroid1) == 2
        assert len(centroid2) == 2

        # One centroid should be near (0.1, 0.033), other near (5.033, 5.1)
        c1_near_origin = abs(centroid1[0]) < 1 and abs(centroid1[1]) < 1
        c2_near_origin = abs(centroid2[0]) < 1 and abs(centroid2[1]) < 1
        # Exactly one should be near origin
        assert c1_near_origin != c2_near_origin

    def test_kmeans_binary_single_point(self, euclidean_distance):
        """One input point: every non-empty centroid must equal that point."""
        points = [[1.0, 1.0]]
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)
        expected = [1.0, 1.0]
        centroids = (centroid1, centroid2)

        # points_close pairs coordinates with zip(), which stops at the
        # shorter argument, so a comparison involving an empty point can
        # never fail. Lengths are therefore checked explicitly here.
        # NOTE(review): how KMeansBinary represents the cluster that receives
        # no points is not visible from this file, so both an empty centroid
        # and a copy of the point are accepted - tighten once confirmed.
        for centroid in centroids:
            assert len(centroid) in (0, len(expected))
            if len(centroid) == len(expected):
                assert points_close(centroid, expected)

        # At least one cluster must actually contain the point.
        assert any(len(centroid) == len(expected) for centroid in centroids)

    def test_kmeans_binary_two_points(self, euclidean_distance):
        """Two input points: each should end up as one of the centroids."""
        points = [[0.0, 0.0], [2.0, 2.0]]
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)

        # Should converge to the two original points
        centroids = [centroid1, centroid2]
        assert any(points_close(c, [0.0, 0.0]) for c in centroids)
        assert any(points_close(c, [2.0, 2.0]) for c in centroids)

    def test_kmeans_binary_convergence(self, euclidean_distance):
        """Two diagonal groups five units apart give well-separated centroids."""
        # Test that algorithm converges within reasonable iterations
        points = [
            [i / 10.0, i / 10.0]
            for i in range(5)  # Points along diagonal
        ] + [
            [i / 10.0 + 5, i / 10.0 + 5]
            for i in range(5)  # Shifted cluster
        ]
        centroid1, centroid2 = KMeansBinary(
            points, euclidean_distance, max_iterations=50
        )

        # Should produce two distinct clusters
        distance_between_centroids = euclidean_distance(centroid1, centroid2)
        assert distance_between_centroids > 2.0  # Should be well separated

    def test_kmeans_binary_with_manhattan_distance(self, manhattan_distance):
        """The Manhattan metric also separates two distant integer groups."""
        points = [[0, 0], [1, 0], [0, 1], [10, 10], [11, 10], [10, 11]]
        centroid1, centroid2 = KMeansBinary(points, manhattan_distance)

        # Should separate into two clusters
        assert len(centroid1) == 2
        assert len(centroid2) == 2

        # One centroid should be near origin, other near (10,10)
        c1_near_origin = abs(centroid1[0]) < 5 and abs(centroid1[1]) < 5
        c2_near_origin = abs(centroid2[0]) < 5 and abs(centroid2[1]) < 5
        assert c1_near_origin != c2_near_origin

    def test_kmeans_binary_max_iterations(self, euclidean_distance):
        """A single allowed iteration still returns two 2-D centroids."""
        # Test that max_iterations parameter works
        points = [[i, i] for i in range(10)]

        # Should work with very few iterations
        centroid1, centroid2 = KMeansBinary(
            points, euclidean_distance, max_iterations=1
        )
        assert len(centroid1) == 2
        assert len(centroid2) == 2

    def test_kmeans_binary_3d_points(self, euclidean_distance):
        """3-D input gives 3-D centroids, exactly one of them near origin."""
        # Test with 3D points
        points = [
            [0, 0, 0],
            [1, 0, 0],
            [0, 1, 0],  # Near origin
            [5, 5, 5],
            [6, 5, 5],
            [5, 6, 5],  # Far from origin
        ]
        centroid1, centroid2 = KMeansBinary(points, euclidean_distance)
        assert len(centroid1) == 3
        assert len(centroid2) == 3

        # Should separate the two groups
        c1_near_origin = all(abs(x) < 3 for x in centroid1)
        c2_near_origin = all(abs(x) < 3 for x in centroid2)
        assert c1_near_origin != c2_near_origin
class TestEdgeCases:
    """Degenerate inputs for ``KMeansBinary``."""

    def test_identical_points(self, euclidean_distance):
        """Five copies of one point: both centroids are compared with it."""
        duplicated = [[1.0, 1.0]] * 5
        first, second = KMeansBinary(duplicated, euclidean_distance)

        # Both centroids should be the same point
        target = [1.0, 1.0]
        assert points_close(first, target)
        assert points_close(second, target)

    def test_collinear_points(self, euclidean_distance):
        """Six points on the x-axis: both centroids are 2-D with y == 0."""
        on_x_axis = [[x, 0] for x in range(6)]
        first, second = KMeansBinary(on_x_axis, euclidean_distance)
        centroids = (first, second)

        # Should still produce two clusters
        for centroid in centroids:
            assert len(centroid) == 2

        # Every input has y == 0, so each centroid's y-coordinate should too.
        for centroid in centroids:
            assert centroid[1] == 0
if __name__ == "__main__":
    # pytest.main() returns the run's exit code; raise SystemExit with it so
    # that running this file directly exits non-zero when a test fails.
    raise SystemExit(pytest.main([__file__]))