.. _program_listing_file_src_lapjv.hpp:

Program Listing for File lapjv.hpp
==================================

|exhale_lsh| :ref:`Return to documentation for file <file_src_lapjv.hpp>` (``src/lapjv.hpp``)

.. |exhale_lsh| unicode:: U+021B0 .. UPWARDS ARROW WITH TIP LEFTWARDS

.. code-block:: cpp

/* This file is part of brille.

Copyright © 2019,2020 Greg Tucker

brille is free software: you can redistribute it and/or modify it under the
terms of the GNU Affero General Public License as published by the Free
Software Foundation, either version 3 of the License, or (at your option)
any later version.

brille is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the GNU Affero General Public License for more details.

You should have received a copy of the GNU Affero General Public License
along with brille. If not, see <https://www.gnu.org/licenses/>. */
/* This file started life as lap.h from https://github.com/src-d/lapjv

Licensed under The MIT License (MIT)

Copyright (c) 2016 source{d}.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef BRILLE_LAPJV_H #define BRILLE_LAPJV_H #include #include #include #include #ifdef __GNUC__ #define always_inline __attribute__((always_inline)) inline #define restrict __restrict__ #elif _WIN32 #define always_inline __forceinline #define restrict __restrict #else #define always_inline inline #define restrict #endif namespace brille::assignment { using sidx = long long int; // in case idx is unsigned :/ template always_inline std::tuple find_umins_plain( idx dim, idx i, const cost *restrict assign_cost, const cost *restrict v) { const cost *local_cost = &assign_cost[i * dim]; cost umin = local_cost[0] - v[0]; sidx j1 = 0; sidx j2 = -1; cost usubmin = (std::numeric_limits::max)(); for (idx j = 1; j < dim; j++) { cost h = local_cost[j] - v[j]; if (h < usubmin) { if (h >= umin) { usubmin = h; j2 = static_cast(j); } else { usubmin = umin; umin = h; j2 = j1; j1 = static_cast(j); } } } return std::make_tuple(umin, usubmin, j1, j2); } // MSVC++ has an awful AVX2 support which does not allow to compile the code #if defined(__AVX2__) && !defined(_WIN32) #include #define FLOAT_MIN_DIM 64 #define DOUBLE_MIN_DIM 100000 // 64-bit code is actually always slower template always_inline std::tuple find_umins( idx dim, idx i, const float *restrict assign_cost, const float *restrict v) { if (dim < FLOAT_MIN_DIM) { return find_umins_plain(dim, i, assign_cost, v); } const float *local_cost = assign_cost + i * dim; __m256i idxvec = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); __m256i j1vec = _mm256_set1_epi32(-1), j2vec = _mm256_set1_epi32(-1); __m256 uminvec = _mm256_set1_ps((std::numeric_limits::max)()), usubminvec = _mm256_set1_ps((std::numeric_limits::max)()); for (idx j = 0; j < dim - 7; j += 8) { __m256 acvec = 
_mm256_loadu_ps(local_cost + j); __m256 vvec = _mm256_loadu_ps(v + j); __m256 h = _mm256_sub_ps(acvec, vvec); __m256 cmp = _mm256_cmp_ps(h, uminvec, _CMP_LE_OQ); usubminvec = _mm256_blendv_ps(usubminvec, uminvec, cmp); j2vec = _mm256_blendv_epi8( j2vec, j1vec, reinterpret_cast<__m256i>(cmp)); uminvec = _mm256_blendv_ps(uminvec, h, cmp); j1vec = _mm256_blendv_epi8( j1vec, idxvec, reinterpret_cast<__m256i>(cmp)); cmp = _mm256_andnot_ps(cmp, _mm256_cmp_ps(h, usubminvec, _CMP_LT_OQ)); usubminvec = _mm256_blendv_ps(usubminvec, h, cmp); j2vec = _mm256_blendv_epi8( j2vec, idxvec, reinterpret_cast<__m256i>(cmp)); idxvec = _mm256_add_epi32(idxvec, _mm256_set1_epi32(8)); } alignas(__m256) float uminmem[8], usubminmem[8]; alignas(__m256) int32_t j1mem[8], j2mem[8]; _mm256_store_ps(uminmem, uminvec); _mm256_store_ps(usubminmem, usubminvec); _mm256_store_si256(reinterpret_cast<__m256i*>(j1mem), j1vec); _mm256_store_si256(reinterpret_cast<__m256i*>(j2mem), j2vec); sidx j1 = -1, j2 = -1; float umin = (std::numeric_limits::max)(), usubmin = (std::numeric_limits::max)(); for (int vi = 0; vi < 8; vi++) { float h = uminmem[vi]; if (h < usubmin) { sidx jnew = static_cast(j1mem[vi]); if (h >= umin) { usubmin = h; j2 = jnew; } else { usubmin = umin; umin = h; j2 = j1; j1 = jnew; } } } for (int vi = 0; vi < 8; vi++) { float h = usubminmem[vi]; if (h < usubmin) { usubmin = h; j2 = static_cast(j2mem[vi]); } } for (idx j = dim & 0xFFFFFFF8u; j < dim; j++) { float h = local_cost[j] - v[j]; if (h < usubmin) { if (h >= umin) { usubmin = h; j2 = static_cast(j); } else { usubmin = umin; umin = h; j2 = j1; j1 = static_cast(j); } } } return std::make_tuple(umin, usubmin, j1, j2); } template always_inline std::tuple find_umins( idx dim, idx i, const double *restrict assign_cost, const double *restrict v) { if (dim < DOUBLE_MIN_DIM) { return find_umins_plain(dim, i, assign_cost, v); } const double *local_cost = assign_cost + i * dim; __m256i idxvec = _mm256_setr_epi64x(0, 1, 2, 3); __m256i j1vec = 
_mm256_set1_epi64x(-1), j2vec = _mm256_set1_epi64x(-1); __m256d uminvec = _mm256_set1_pd((std::numeric_limits::max)()), usubminvec = _mm256_set1_pd((std::numeric_limits::max)()); for (idx j = 0; j < dim - 3; j += 4) { __m256d acvec = _mm256_loadu_pd(local_cost + j); __m256d vvec = _mm256_loadu_pd(v + j); __m256d h = _mm256_sub_pd(acvec, vvec); __m256d cmp = _mm256_cmp_pd(h, uminvec, _CMP_LE_OQ); usubminvec = _mm256_blendv_pd(usubminvec, uminvec, cmp); j2vec = _mm256_blendv_epi8( j2vec, j1vec, reinterpret_cast<__m256i>(cmp)); uminvec = _mm256_blendv_pd(uminvec, h, cmp); j1vec = _mm256_blendv_epi8( j1vec, idxvec, reinterpret_cast<__m256i>(cmp)); cmp = _mm256_andnot_pd(cmp, _mm256_cmp_pd(h, usubminvec, _CMP_LT_OQ)); usubminvec = _mm256_blendv_pd(usubminvec, h, cmp); j2vec = _mm256_blendv_epi8( j2vec, idxvec, reinterpret_cast<__m256i>(cmp)); idxvec = _mm256_add_epi64(idxvec, _mm256_set1_epi64x(4)); } alignas(__m256d) double uminmem[4], usubminmem[4]; alignas(__m256d) int64_t j1mem[4], j2mem[4]; _mm256_store_pd(uminmem, uminvec); _mm256_store_pd(usubminmem, usubminvec); _mm256_store_si256(reinterpret_cast<__m256i*>(j1mem), j1vec); _mm256_store_si256(reinterpret_cast<__m256i*>(j2mem), j2vec); sidx j1 = -1, j2 = -1; double umin = (std::numeric_limits::max)(), usubmin = (std::numeric_limits::max)(); for (int vi = 0; vi < 4; vi++) { double h = uminmem[vi]; if (h < usubmin) { sidx jnew = static_cast(j1mem[vi]); if (h >= umin) { usubmin = h; j2 = jnew; } else { usubmin = umin; umin = h; j2 = j1; j1 = jnew; } } } for (int vi = 0; vi < 4; vi++) { double h = usubminmem[vi]; if (h < usubmin) { usubmin = h; j2 = static_cast(j2mem[vi]); } } for (idx j = dim & 0xFFFFFFFCu; j < dim; j++) { double h = local_cost[j] - v[j]; if (h < usubmin) { if (h >= umin) { usubmin = h; j2 = static_cast(j); } else { usubmin = umin; umin = h; j2 = j1; j1 = static_cast(j); } } } return std::make_tuple(umin, usubmin, j1, j2); } #else // __AVX__ #define find_umins find_umins_plain #endif // __AVX__ 
template cost lapjv(idx dim, const cost *restrict assign_cost, bool verbose, idx *restrict rowsol, idx *restrict colsol, cost *restrict u, cost *restrict v) { auto free = std::unique_ptr(new idx[dim]); // list of unassigned rows. auto collist = std::unique_ptr(new idx[dim]); // list of columns to be scanned in various ways. auto matches = std::unique_ptr(new idx[dim]); // counts how many times a row could be assigned. auto d = std::unique_ptr(new cost[dim]); // 'cost-distance' in augmenting path calculation. auto pred = std::unique_ptr(new idx[dim]); // row-predecessor of column in augmenting/alternating path. if (1==dim){ rowsol[0] = colsol[0] = static_cast(0); return assign_cost[0]; } // init how many times a row will be assigned in the column reduction. #if _OPENMP >= 201307 #pragma omp simd #endif for (idx i = 0; i < dim; i++) { matches[i] = 0; } // Following https://github.com/Fil/lap-jv/blob/master/lap.js // add an epsilon to comparisons to break infinite loops cost total_cost{0}; for (idx tc=0; tc(10000*dim); // COLUMN REDUCTION //for (idx j = dim - 1; j >= 0; j--) { // reverse order gives better results. for (idx j = dim; j-- > 0; ){ // find minimum cost over rows. cost min = assign_cost[j]; idx imin = 0; for (idx i = 1; i < dim; i++) { const cost *local_cost = &assign_cost[i * dim]; if (local_cost[j] < min) { min = local_cost[j]; imin = i; } } v[j] = min; if (++matches[imin] == 1) { // init assignment if minimum row assigned for first time. rowsol[imin] = j; colsol[j] = imin; } else { colsol[j] = static_cast(-1); // row already assigned, column not assigned. } } if (verbose) { printf("lapjv: COLUMN REDUCTION finished\n"); } // REDUCTION TRANSFER idx numfree = 0; for (idx i = 0; i < dim; i++) { const cost *local_cost = &assign_cost[i * dim]; if (matches[i] == 0) { // fill list of unassigned 'free' rows. free[numfree++] = i; } else if (matches[i] == 1) { // transfer reduction from rows that are assigned once. 
idx j1 = rowsol[i]; cost min = (std::numeric_limits::max)(); for (idx j = 0; j < dim; j++) { if (j != j1) { if (local_cost[j] - v[j] < min + cost_epsilon) { min = local_cost[j] - v[j]; } } } v[j1] = v[j1] - min; } } if (verbose) { printf("lapjv: REDUCTION TRANSFER finished\n"); } // AUGMENTING ROW REDUCTION for (int loopcnt = 0; loopcnt < 2; loopcnt++) { // loop to be done twice. // scan all free rows. // in some cases, a free row may be replaced with another one to be scanned next. idx k = 0; idx prevnumfree = numfree; numfree = 0; // start list of rows still free after augmenting row reduction. while (k < prevnumfree) { idx i = free[k++]; // find minimum and second minimum reduced cost over columns. cost umin, usubmin; sidx j1, j2; std::tie(umin, usubmin, j1, j2) = find_umins(dim, i, assign_cost, v); idx i0 = colsol[j1]; cost vj1_new = v[j1] - (usubmin + cost_epsilon - umin); bool vj1_lowers = vj1_new < v[j1]; // the trick to eliminate the epsilon bug if (vj1_lowers) { // change the reduction of the minimum column to increase the minimum // reduced cost in the row to the subminimum. v[j1] = vj1_new; } else if (i0 != static_cast(-1)) { // (was >=0) minimum and subminimum equal. // minimum column j1 is assigned. // swap columns j1 and j2, as j2 may be unassigned. j1 = j2; i0 = colsol[j2]; } // (re-)assign i to j1, possibly de-assigning an i0. rowsol[i] = static_cast(j1); colsol[j1] = i; if (i0 != static_cast(-1)) { // (was >=0) minimum column j1 assigned earlier. if (vj1_lowers) { // put in current k, and go back to that k. // continue augmenting path i - j1 with i0. free[--k] = i0; } else { // no further augmenting reduction possible. // store i0 in list of free rows for next phase. free[numfree++] = i0; } } } if (verbose) { printf("lapjv: AUGMENTING ROW REDUCTION %d / %d\n", loopcnt + 1, 2); } } // for loopcnt // AUGMENT SOLUTION for each free row. for (idx f = 0; f < numfree; f++) { idx endofpath{0}; idx freerow = free[f]; // start row of augmenting path. 
if (verbose) { printf("lapjv: AUGMENT SOLUTION row %d [%d / %d]\n", freerow, f + 1, numfree); } // Dijkstra shortest path algorithm. // runs until unassigned column added to shortest path tree. #if _OPENMP >= 201307 #pragma omp simd #endif for (idx j = 0; j < dim; j++) { d[j] = assign_cost[freerow * dim + j] - v[j]; pred[j] = freerow; collist[j] = j; // init column list. } idx low = 0; // columns in 0..low-1 are ready, now none. idx up = 0; // columns in low..up-1 are to be scanned for current minimum, now none. // columns in up..dim-1 are to be considered later to find new minimum, // at this stage the list simply contains all columns bool unassigned_found = false; // initialized in the first iteration: low == up == 0 sidx last = 0; cost min = 0; do { if (up == low) { // no more columns to be scanned for current minimum. last = static_cast(low) - 1; // scan columns for up..dim-1 to find all indices for which new minimum occurs. // store these indices between low..up-1 (increasing up). min = d[collist[up++]]; for (idx k = up; k < dim; k++) { idx j = collist[k]; cost h = d[j]; if (h <= min) { if (h < min) { // new minimum. up = low; // restart list at index low. min = h; } // new index with same minimum, put on undex up, and extend list. collist[k] = collist[up]; collist[up++] = j; } } // check if any of the minimum columns happens to be unassigned. // if so, we have an augmenting path right away. for (idx k = low; k < up; k++) { if (colsol[collist[k]] == static_cast(-1)) { // (was <0) endofpath = collist[k]; unassigned_found = true; break; } } } if (!unassigned_found) { // update 'distances' between freerow and all unscanned columns, via next scanned column. 
idx j1 = collist[low]; low++; idx i = colsol[j1]; const cost *local_cost = &assign_cost[i * dim]; cost h = local_cost[j1] - v[j1] - min; for (idx k = up; k < dim; k++) { idx j = collist[k]; cost v2 = local_cost[j] - v[j] - h; if (v2 < d[j]) { pred[j] = i; if (v2 == min) { // new column found at same minimum value if (colsol[j] == static_cast(-1)) { //(was <0) // if unassigned, shortest augmenting path is complete. endofpath = j; unassigned_found = true; break; } else { // else add to list to be scanned right away. collist[k] = collist[up]; collist[up++] = j; } } d[j] = v2; } } } } while (!unassigned_found); // update column prices. #if _OPENMP >= 201307 #pragma omp simd #endif for (sidx k = 0; k <= last; k++) { idx j1 = collist[k]; v[j1] = v[j1] + d[j1] - min; } // reset row and column assignments along the alternating path. { idx i; do { i = pred[endofpath]; colsol[endofpath] = i; idx j1 = endofpath; endofpath = rowsol[i]; rowsol[i] = j1; } while (i != freerow); } } if (verbose) { printf("lapjv: AUGMENT SOLUTION finished\n"); } // calculate optimal cost. cost lapcost = 0; #if _OPENMP >= 201307 #pragma omp simd reduction(+:lapcost) #endif for (idx i = 0; i < dim; i++) { const cost *local_cost = &assign_cost[i * dim]; idx j = rowsol[i]; u[i] = local_cost[j] - v[j]; lapcost += local_cost[j]; } if (verbose) { printf("lapjv: optimal cost calculated\n"); } return lapcost; } } // end namespace brille::assignment #endif