Octopus
kmeans_clustering.F90
Go to the documentation of this file.
1!! Copyright (C) 2024. A Buccheri.
2!!
3!! This program is free software; you can redistribute it and/or modify
4!! it under the terms of the GNU General Public License as published by
5!! the Free Software Foundation; either version 2, or (at your option)
6!! any later version.
7!!
8!! This program is distributed in the hope that it will be useful,
9!! but WITHOUT ANY WARRANTY; without even the implied warranty of
10!! MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11!! GNU General Public License for more details.
12!!
13!! You should have received a copy of the GNU General Public License
14!! along with this program; if not, write to the Free Software
15!! Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16!! 02110-1301, USA.
17
18#include "global.h"
19
21 use, intrinsic :: iso_fortran_env
22 use debug_oct_m
24 use global_oct_m
25 use mesh_oct_m
27 use mpi_oct_m
32 use space_oct_m
33 use sort_oct_m
35 implicit none
36 private
37
42
43contains
44
45 ! TODO(Alex) Issue #1004. Implement `assign_points_to_centroids` for periodic boundary conditions
46
76 subroutine assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
77 class(mesh_t), intent(in) :: mesh
78 real(real64), intent(in) :: centroids(:, :)
79 integer, intent(out) :: ip_to_ic(:)
80
81 integer :: ip, ic, icen
82 integer :: n_centroids
83 real(real64) :: min_dist, dist
84 real(real64), allocatable :: point(:)
85
86 ! Some small finite tolerance is required to distinguish degenerate points, else
87 ! `if (dist < min_dist)` can vary on different hardware due to numerical noise.
88 ! One could equally choose tol to be some percentage of the grid spacing.
89 real(real64), parameter :: tol = 1.0e-13_real64
90
92
93 ! Grid to centroid index map should have size of the grid
94 assert(size(ip_to_ic) == mesh%np)
95
96 n_centroids = size(centroids, 2)
97 safe_allocate(point(1:size(centroids, 1)))
98
99 !$omp parallel do default(shared) private(point, icen, min_dist, dist)
100 do ip = 1, mesh%np
101 ip_to_ic(ip) = 0
102 ! Compute which centroid, grid point `ip` is closest to
103 point = mesh%x(ip, :)
104 icen = 1
105 min_dist = sum((centroids(:, 1) - point(:))**2)
106 do ic = 2, n_centroids
107 dist = sum((centroids(:, ic) - point(:))**2)
108 if (dist < min_dist - tol) then
109 min_dist = dist
110 icen = ic
111 endif
112 enddo
113 ip_to_ic(ip) = icen
114 enddo
115 !$omp end parallel do
116
117 safe_deallocate_a(point)
118
120
122
123
136 subroutine update_centroids(mesh, weight, ip_to_ic, centroids)
137 class(mesh_t), intent(in) :: mesh
138 real(real64), intent(in) :: weight(:)
139 integer, intent(in) :: ip_to_ic(:)
140 real(real64), contiguous, intent(inout) :: centroids(:, :)
141 ! Out: Updated centroid positions
142
143 integer :: n_centroids, ic, ip
144 real(real64) :: one_over_denom
145 real(real64), allocatable :: denominator(:)
146
147 push_sub(update_centroids)
148
149 ! The indexing of weight and grid must be consistent => must belong to the same spatial distribution
150 ! This does not explicit assert this, but if the grid sizes differ, this is a clear indicator of a problem
151 assert(mesh%np == size(weight))
152
153 n_centroids = size(centroids, 2)
154 safe_allocate(denominator(1:n_centroids))
155
156 do ic = 1, n_centroids
157 centroids(:, ic) = 0._real64
158 denominator(ic) = 0._real64
159 enddo
160
161 !$omp parallel do private(ic) reduction(+ : centroids, denominator)
162 do ip = 1, mesh%np
163 ic = ip_to_ic(ip)
164 ! Initially accumulate the numerator in `centroids`
165 centroids(:, ic) = centroids(:, ic) + (mesh%x(ip, :) * weight(ip))
166 denominator(ic) = denominator(ic) + weight(ip)
167 enddo
168 !$omp end parallel do
170 ! Gather contributions to numerator and denominator of all centroids, from all domains of the mesh/grid
171 call mesh%allreduce(centroids)
172 call mesh%allreduce(denominator)
173
174 ! If division by zero occurs here it implies that the sum(weight) = 0 for all grid points
175 ! in cluster ic. This can occur if the initial centroid is poorly chosen at a point with no
176 !! associated weight (such as the vacuum of a crystal cell)
177 !$omp parallel do private(one_over_denom) reduction(* : centroids)
178 do ic = 1, n_centroids
179 one_over_denom = 1._real64 / denominator(ic)
180 centroids(:, ic) = centroids(:, ic) * one_over_denom
181 enddo
182 !$omp end parallel do
183
184 safe_deallocate_a(denominator)
185 pop_sub(update_centroids)
186
187 end subroutine update_centroids
188
189
194 subroutine compute_grid_difference(points, updated_points, tol, points_differ)
195 real(real64), intent(in) :: points(:, :)
196 real(real64), intent(in) :: updated_points(:, :)
197 real(real64), intent(in) :: tol
198 logical, intent(out) :: points_differ(:)
199
200 integer :: ip, n_dim
201 real(real64), allocatable :: diff(:)
202
204
205 n_dim = size(points, 1)
206 allocate(diff(n_dim))
207
208 !$omp parallel do default(shared) private(diff)
209 do ip = 1, size(points, 2)
210 diff(:) = abs(updated_points(:, ip) - points(:, ip))
211 points_differ(ip) = any(diff > tol)
212 enddo
213 !$omp end parallel do
214
215 if(debug%info) then
216 call report_differences_in_grids(points, updated_points, tol, points_differ)
217 endif
218
220
221 end subroutine compute_grid_difference
222
223
225 subroutine report_differences_in_grids(points, updated_points, tol, points_differ)
226 real(real64), intent(in) :: points(:, :)
227 real(real64), intent(in) :: updated_points(:, :)
228 real(real64), intent(in) :: tol
229 logical, intent(in) :: points_differ(:)
230
231 integer, allocatable :: indices(:)
232 integer :: i, j, n_unconverged, ndim
233 character(len=50) :: f_string
234 real(real64), allocatable :: diff(:)
235
237
238 indices = pack([(i, i=1,size(points_differ))], points_differ)
239 n_unconverged = size(indices)
240 ndim = size(points, 1)
241 allocate(diff(ndim))
242
243 write(f_string, '(A, I1, A, I1, A, I1, A)') '(', &
244 & ndim, '(F16.10, X), ', &
245 & ndim, '(F16.10, X), ', &
246 & ndim, '(F16.10, X), F16.10)'
247
248 write(message(1), '(a)') "# Current Point , Prior Point , |ri - r_{i-1}| , tol"
249 call messages_info(1)
250 do j = 1, n_unconverged
251 i = indices(j)
252 diff(:) = abs(updated_points(:, i) - points(:, i))
253 write(message(1), f_string) updated_points(:, i), points(:, i), diff, tol
254 call messages_info(1)
255 enddo
256 write(message(1), *) "Summary:", n_unconverged, "of out", size(points, 2), "are not converged"
257 call messages_info(1)
258
260
261 end subroutine report_differences_in_grids
262
263
293 subroutine weighted_kmeans(space, mesh, weight, centroids, n_iter, centroid_tol, discretize, inertia)
294 class(space_t), intent(in) :: space
295 class(mesh_t), intent(in) :: mesh
296 real(real64), intent(in) :: weight(:)
297 real(real64), contiguous, intent(inout) :: centroids(:, :)
298 ! Out: Final centroids
299 integer, optional, intent(in ) :: n_iter
300 real(real64), optional, intent(in ) :: centroid_tol
301 logical, optional, intent(in ) :: discretize
302 real(real64), optional, intent(out) :: inertia
303
304 logical :: discretize_centroids
305 integer :: n_iterations, n_centroid, i
306 real(real64) :: tol
307 integer, allocatable :: ip_to_ic(:)
308 real(real64), allocatable :: prior_centroids(:, :)
309 logical, allocatable :: points_differ(:)
310
311 push_sub(weighted_kmeans)
312
313 n_iterations = optional_default(n_iter, 200)
314 tol = optional_default(centroid_tol, 1.e-4_real64)
315 discretize_centroids = optional_default(discretize, .true.)
316
317 ! Should use a positive number of iterations
318 assert(n_iterations >= 1)
319 ! Number of weights inconsistent with number of grid points
320 assert(size(weight) == mesh%np)
321 ! Spatial dimensions of centroids array is inconsistent
322 assert(size(centroids, 1) == space%dim)
323 ! Assignment of points to centroids only implemented for finite BCs
324 assert(.not. space%is_periodic())
325
326 ! Work arrays
327 n_centroid = size(centroids, 2)
328 safe_allocate_source(prior_centroids(space%dim, size(centroids, 2)), centroids)
329 safe_allocate(ip_to_ic(1:mesh%np))
330 safe_allocate(points_differ(1:n_centroid))
331
332 write(message(1), '(a)') 'Debug: Performing weighted Kmeans clustering '
333 call messages_info(1, debug_only=.true.)
334
335 do i = 1, n_iterations
336 write(message(1), '(a, I3)') 'Debug: Iteration ', i
337 call messages_info(1, debug_only=.true.)
338 ! TODO(Alex) Issue #1004. Implement `assign_points_to_centroids` for periodic boundary conditions
339 call assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
340
341 call update_centroids(mesh, weight, ip_to_ic, centroids)
342 call compute_grid_difference(prior_centroids, centroids, tol, points_differ)
343
344 if (any(points_differ)) then
345 prior_centroids = centroids
346 else
347 write(message(1), '(a)') 'Debug: All centroid points converged'
348 call messages_info(1, debug_only=.true.)
349 ! Break loop
350 exit
351 endif
352
353 enddo
354
355 if (discretize_centroids) then
356 call mesh_discretize_values_to_mesh(mesh, centroids)
357 endif
358
359 if (present(inertia)) then
360 call compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
361 endif
362
363 safe_deallocate_a(prior_centroids)
364 safe_deallocate_a(ip_to_ic)
365 safe_deallocate_a(points_differ)
366
367 pop_sub(weighted_kmeans)
368
369 end subroutine weighted_kmeans
370
371
376 subroutine sample_initial_centroids(mesh, centroids, seed_value)
377 class(mesh_t), intent(in ) :: mesh
378 real(real64), contiguous, intent(out) :: centroids(:, :)
379 integer(int64), intent(inout), optional :: seed_value
380 ! This will get mutated by the Fisher Yates shuffle
381
382 integer(int32) :: n_centroids
383 integer(int64), allocatable :: centroid_idx(:)
384 integer(int64) :: ipg
385 integer(int32) :: ic
388
389 n_centroids = size(centroids, 2)
390 safe_allocate(centroid_idx(1:n_centroids))
391
392 ! Choose n_centroids indices from [1, np_global]
393 call fisher_yates_shuffle(n_centroids, mesh%np_global, seed_value, centroid_idx)
394
395 ! Convert ip_global to (x,y,z)
396 do ic = 1, n_centroids
397 ipg = centroid_idx(ic)
398 centroids(:, ic) = mesh_x_global(mesh, ipg)
399 enddo
400
401 safe_deallocate_a(centroid_idx)
402
404
405 end subroutine sample_initial_centroids
406
407
417 subroutine compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
418 class(mesh_t), intent(in) :: mesh
419 real(real64), intent(in) :: centroids(:, :)
420 real(real64), intent(in) :: weight(:)
421 integer, intent(in) :: ip_to_ic(:)
422 real(real64), intent(out) :: inertia
423 integer :: ip, ic
424
425 inertia = 0.0_real64
426
427 !$omp parallel do private(ip) reduction(+ : inertia)
428 do ip = 1, mesh%np
429 ic = ip_to_ic(ip)
430 inertia = inertia + weight(ip) * sum((centroids(:, ic) - mesh%x(ip, :))**2)
431 enddo
432 !$omp end parallel do
433
434 call mesh%allreduce(inertia)
435
436 end subroutine compute_centroid_inertia
437
439
440!! Local Variables:
441!! mode: f90
442!! coding: utf-8
443!! End:
type(debug_t), save, public debug
Definition: debug.F90:156
subroutine report_differences_in_grids(points, updated_points, tol, points_differ)
Report differences returned from compute_grid_difference.
subroutine, public weighted_kmeans(space, mesh, weight, centroids, n_iter, centroid_tol, discretize, inertia)
Weighted K-means clustering.
subroutine, public sample_initial_centroids(mesh, centroids, seed_value)
Sample initial centroids from the full mesh.
subroutine, public assign_points_to_centroids_finite_bc(mesh, centroids, ip_to_ic)
Assign each grid point to the closest centroid. A centroid and its set of nearest grid points defines...
subroutine, public update_centroids(mesh, weight, ip_to_ic, centroids)
Compute a new set of centroids.
subroutine compute_centroid_inertia(mesh, centroids, weight, ip_to_ic, inertia)
Compute the inertia of all centroids.
subroutine compute_grid_difference(points, updated_points, tol, points_differ)
Compute the difference in two grids as .
This module defines the meshes, which are used in Octopus.
Definition: mesh.F90:118
subroutine, public mesh_discretize_values_to_mesh(mesh, values)
Assign a set of values to their nearest discrete points on the mesh.
Definition: mesh.F90:412
real(real64) function, dimension(1:mesh%box%dim), public mesh_x_global(mesh, ipg)
Definition: mesh.F90:808
character(len=256), dimension(max_lines), public message
to be output by fatal, warning
Definition: messages.F90:160
subroutine, public messages_info(no_lines, iunit, debug_only, stress, all_nodes, namespace)
Definition: messages.F90:616
This module contains some common usage patterns of MPI routines.
Definition: mpi_lib.F90:115
subroutine, public fisher_yates_shuffle(m, n, seed, values)
Return m random numbers from a range of with no replacement.
Definition: quickrnd.F90:309
This module is intended to contain "only mathematical" functions and procedures.
Definition: sort.F90:117
Describes mesh distribution to nodes.
Definition: mesh.F90:186
int true(void)