/* Copyright (c) 2020 NVIDIA Corporation.
 * Copyright (c) 2018-2020 Chris Choy (chrischoy@ai.stanford.edu).
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Please cite "4D Spatio-Temporal ConvNets: Minkowski Convolutional Neural
 * Networks", CVPR'19 (https://arxiv.org/abs/1904.08755) if you use any part
 * of the code.
 */
#ifndef KERNEL_MAP_CUH
#define KERNEL_MAP_CUH

#include "3rdparty/hash/hash_allocator.cuh"
#include "coordinate_map_functors.cuh"
#include "storage.cuh"
#include "types.hpp"

#include <algorithm>
#include <cstdlib>
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>
#include <thrust/sort.h>
#include <thrust/tuple.h>

namespace minkowski {

template <typename index_type, typename ByteAllocator> class gpu_kernel_map {
public:
  using size_type = default_types::size_type;
  using byte_allocator_type = ByteAllocator;
  using self_type = gpu_kernel_map<index_type, ByteAllocator>;

  // Lightweight view over one of the three device arrays (kernels, in_maps,
  // out_maps). After decompose(), begin(k)/end(k)/size(k) address the
  // contiguous segment that belongs to kernel offset k.
  class contiguous_memory {
  public:
    using self_type = contiguous_memory;

  public:
    contiguous_memory() = delete;
    contiguous_memory(gpu_kernel_map &kernel_map) : m_kernel_map{kernel_map} {}

    self_type &operator=(self_type const &other) {
      m_data = other.m_data;
      m_kernel_map = other.m_kernel_map;
      return *this;
    }

    inline typename std::map<index_type, index_type>::const_iterator
    key_cbegin() const {
      return m_kernel_map.m_kernel_offset_map.cbegin();
    }
    inline typename std::map<index_type, index_type>::const_iterator
    key_cend() const {
      return m_kernel_map.m_kernel_offset_map.cend();
    }

    inline index_type *data() { return m_data; }
    inline index_type const *cdata() const { return m_data; }
    inline void data(index_type *p_data) { m_data = p_data; }

    inline index_type *begin() const { return m_data; }
    inline index_type *begin(index_type kernel_index) const {
      auto const &offset_map = m_kernel_map.m_kernel_offset_map;
      auto const iter = offset_map.find(kernel_index);
      if (iter == offset_map.end()) {
        // LOG_WARN("gpu_kernel_map for kernel", kernel_index, "not found.");
        return m_data;
      } else {
        return m_data + iter->second;
      }
    }

    inline index_type *end() const { return m_data + m_kernel_map.m_capacity; }
    inline index_type *end(index_type kernel_index) const {
      auto const &offset_map = m_kernel_map.m_kernel_offset_map;
      auto const iter = offset_map.find(kernel_index);
      if (iter == offset_map.end()) {
        // LOG_WARN("gpu_kernel_map for kernel", kernel_index, "not found.");
        return m_data + m_kernel_map.m_capacity;
      } else {
        return m_data + iter->second +
               m_kernel_map.m_kernel_size_map[kernel_index];
      }
    }

    size_type size(index_type kernel_index) const {
      auto const &size_map = m_kernel_map.m_kernel_size_map;
      auto const iter = size_map.find(kernel_index);
      if (iter == size_map.end())
        return 0;
      return iter->second;
    }

  private:
    index_type *m_data{nullptr};
    gpu_kernel_map &m_kernel_map;
  }; // end contiguous_memory
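  // Hedged usage sketch (illustrative only, not part of the API surface):
  // once decompose() has run, each kernel offset k owns a contiguous slice of
  // the in/out map arrays, so per-kernel work can be launched directly on
  // device pointers. `kmap` below is a hypothetical, already-decomposed
  // instance.
  //
  //   for (auto it = kmap.key_cbegin(); it != kmap.key_cend(); ++it) {
  //     index_type const k = it->first;                 // kernel offset index
  //     index_type const *d_in = kmap.in_maps.begin(k); // device pointer
  //     index_type const *d_out = kmap.out_maps.begin(k);
  //     size_type const n = kmap.in_maps.size(k);       // pairs mapped by k
  //     // gather n input rows via d_in, apply kernel k, scatter via d_out
  //   }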
public:
  gpu_kernel_map()
      : m_memory_size_byte{0}, m_capacity{0}, kernels{*this}, in_maps{*this},
        out_maps{*this} {
    LOG_DEBUG("Initialized gpu_kernel_map");
  }

  gpu_kernel_map(self_type const &other)
      : m_decomposed(other.m_decomposed),
        m_requires_kernel_index(other.m_requires_kernel_index),
        m_memory_size_byte(other.m_memory_size_byte),
        m_capacity{other.m_capacity}, m_in_map_memory{other.m_in_map_memory},
        m_out_map_memory{other.m_out_map_memory},
        m_allocator{other.m_allocator},
        m_kernel_size_map{other.m_kernel_size_map},
        m_kernel_offset_map{other.m_kernel_offset_map}, kernels{*this},
        in_maps{*this}, out_maps{*this} {
    LOG_DEBUG("gpu_kernel_map copy constructor");
    in_maps.data(other.in_maps.begin());
    out_maps.data(other.out_maps.begin());
    if (m_requires_kernel_index) {
      m_kernel_index_memory = other.m_kernel_index_memory;
      kernels.data(other.kernels.begin());
    }
  }

  gpu_kernel_map(size_type capacity,
                 byte_allocator_type alloc = byte_allocator_type(),
                 bool requires_kernel_index = true)
      : m_requires_kernel_index(requires_kernel_index), m_capacity{capacity},
        m_allocator{alloc}, kernels{*this}, in_maps{*this}, out_maps{*this} {
    // kernel map without kernel index
    m_memory_size_byte = capacity * sizeof(index_type);
    index_type *ptr_in_map = reinterpret_cast<index_type *>(
        m_allocator.allocate(m_memory_size_byte));
    index_type *ptr_out_map = reinterpret_cast<index_type *>(
        m_allocator.allocate(m_memory_size_byte));
    index_type *ptr_kernel = nullptr;

    auto deleter = [](index_type *p, byte_allocator_type alloc,
                      size_type size) {
      alloc.deallocate(reinterpret_cast<char *>(p), size);
      LOG_DEBUG("Deallocate kernel map");
    };

    m_in_map_memory = std::shared_ptr<index_type[]>{
        ptr_in_map, std::bind(deleter, std::placeholders::_1, m_allocator,
                              m_memory_size_byte)};
    m_out_map_memory = std::shared_ptr<index_type[]>{
        ptr_out_map, std::bind(deleter, std::placeholders::_1, m_allocator,
                               m_memory_size_byte)};

    // kernel maps
    in_maps.data(m_in_map_memory.get());
    out_maps.data(m_out_map_memory.get());

    if (requires_kernel_index) {
      ptr_kernel = reinterpret_cast<index_type *>(
          m_allocator.allocate(m_memory_size_byte));
      m_kernel_index_memory = std::shared_ptr<index_type[]>{
          ptr_kernel, std::bind(deleter, std::placeholders::_1, m_allocator,
                                m_memory_size_byte)};
      kernels.data(m_kernel_index_memory.get());
    } else {
      // A single kernel covers the whole map; decomposition is trivial.
      m_kernel_offset_map[0] = 0;
      m_kernel_size_map[0] = capacity;
      // Initialize the decomposed begins and sizes
      m_decomposed = true;
    }
  }
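  // swap() returns a shallow copy with the in/out roles exchanged: the copy
  // shares this map's device buffers, and only the in_maps/out_maps data
  // pointers are swapped. Presumably this lets the same map drive an
  // operation in the reverse direction (e.g. a transposed convolution)
  // without reallocating or copying the device arrays.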
  self_type swap() const {
    self_type swapped_gpu_kernel_map(*this);
    swapped_gpu_kernel_map.in_maps.data(
        swapped_gpu_kernel_map.m_out_map_memory.get());
    swapped_gpu_kernel_map.out_maps.data(
        swapped_gpu_kernel_map.m_in_map_memory.get());

#ifdef DEBUG
    size_type map_size = std::min<size_type>(in_maps.size(0), 100);
    index_type *p_kernel_map =
        (index_type *)std::malloc(map_size * 3 * sizeof(index_type));
    // CUDA_CHECK(cudaMemcpy(p_kernel_map, kernels.begin(),
    //                       map_size * sizeof(index_type),
    //                       cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(p_kernel_map + 1 * map_size, in_maps.begin(),
                          map_size * sizeof(index_type),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 * map_size, out_maps.begin(),
                          map_size * sizeof(index_type),
                          cudaMemcpyDeviceToHost));

    for (index_type i = 0; i < map_size; ++i) {
      std::cout // << p_kernel_map[i + 0 * map_size] << ":"
          << p_kernel_map[i + 1 * map_size] << "->"
          << p_kernel_map[i + 2 * map_size] << "\n";
    }

    std::cout << "Swapped kernel map\n";
    // CUDA_CHECK(cudaMemcpy(p_kernel_map,
    //                       swapped_gpu_kernel_map.kernels.begin(),
    //                       map_size * sizeof(index_type),
    //                       cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(
        p_kernel_map + 1 * map_size, swapped_gpu_kernel_map.in_maps.begin(),
        map_size * sizeof(index_type), cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(
        p_kernel_map + 2 * map_size, swapped_gpu_kernel_map.out_maps.begin(),
        map_size * sizeof(index_type), cudaMemcpyDeviceToHost));

    for (index_type i = 0; i < map_size; ++i) {
      std::cout // << p_kernel_map[i + 0 * map_size] << ":"
          << p_kernel_map[i + 1 * map_size] << "->"
          << p_kernel_map[i + 2 * map_size] << "\n";
    }
    CUDA_CHECK(cudaDeviceSynchronize());
    std::free(p_kernel_map);
#endif

    return swapped_gpu_kernel_map;
  }

  self_type &operator=(self_type const &other) {
    m_decomposed = other.m_decomposed;
    m_requires_kernel_index = other.m_requires_kernel_index;

    m_memory_size_byte = other.m_memory_size_byte;
    m_capacity = other.m_capacity;

    m_kernel_index_memory = other.m_kernel_index_memory;
    m_in_map_memory = other.m_in_map_memory;
    m_out_map_memory = other.m_out_map_memory;
    m_allocator = other.m_allocator;

    m_kernel_size_map = other.m_kernel_size_map;
    m_kernel_offset_map = other.m_kernel_offset_map;

    in_maps.data(other.in_maps.begin());
    out_maps.data(other.out_maps.begin());
    kernels.data(other.kernels.begin());

    return *this;
  }

  // functions
  inline typename std::map<index_type, index_type>::const_iterator
  key_cbegin() const {
    return m_kernel_offset_map.cbegin();
  }
  inline typename std::map<index_type, index_type>::const_iterator
  key_cend() const {
    return m_kernel_offset_map.cend();
  }

  // total number of map entries across all kernels
  size_type size() const {
    size_type nmap = 0;
    for (auto const &k : m_kernel_size_map) {
      nmap += k.second;
    }
    return nmap;
  }

  // number of unique kernel indices
  size_type volume() const { return m_kernel_size_map.size(); }

  // size of the largest per-kernel segment
  size_type max_size() const {
    size_type nmap = 0;
    for (auto const &k : m_kernel_size_map) {
      if (k.second > nmap)
        nmap = k.second;
    }
    return nmap;
  }

  size_type size(index_type const kernel_index) const {
    auto const iter = m_kernel_size_map.find(kernel_index);
    if (iter == m_kernel_size_map.end())
      return 0;
    return iter->second;
  }

  std::string to_string() const {
    Formatter o;
    size_type map_size = 0;
    for (auto const &kv : m_kernel_size_map) {
      map_size += kv.second;
    }
    o << "gpu_kernel_map: number of unique maps:" << m_kernel_size_map.size()
      << ", kernel map size:" << map_size;
    return o.str();
  }
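  // Hedged worked example of what decompose() computes. Suppose capacity = 5
  // and the flat arrays hold (kernel index, in, out) triplets:
  //
  //   kernels: 1 0 1 2 0   in_maps: a b c d e   out_maps: v w x y z
  //
  // After sort_by_key on `kernels` (values follow their keys):
  //
  //   kernels: 0 0 1 1 2   in_maps: b e a c d   out_maps: w z v x y
  //
  // reduce_by_key over (position, 1) value pairs then yields, per unique
  // kernel index, its first position and its count:
  //
  //   m_kernel_offset_map = {0:0, 1:2, 2:4}
  //   m_kernel_size_map   = {0:2, 1:2, 2:1}
  //
  // so begin(k)/size(k) can address each kernel's contiguous segment.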
  void decompose() {
    LOG_DEBUG("Decomposing", kernels.end() - kernels.begin(), "elements");
    // the memory space must be initialized first!

    // sort the (in, out) pairs by their kernel index
    THRUST_CHECK(thrust::sort_by_key(thrust::device,            //
                                     kernels.begin(),           // key begin
                                     kernels.end(),             // key end
                                     thrust::make_zip_iterator( // value begin
                                         thrust::make_tuple(    //
                                             in_maps.begin(),   //
                                             out_maps.begin()   //
                                             ))));

#ifdef DEBUG
    size_type map_size =
        std::min<size_type>(in_maps.end() - in_maps.begin(), 100);
    LOG_DEBUG("printing", map_size, "kernel maps");
    index_type *p_kernel_map =
        (index_type *)std::malloc(map_size * 3 * sizeof(index_type));
    CUDA_CHECK(cudaMemcpy(p_kernel_map, m_kernel_index_memory.get(),
                          map_size * sizeof(index_type),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(p_kernel_map + map_size, m_in_map_memory.get(),
                          map_size * sizeof(index_type),
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(p_kernel_map + 2 * map_size, m_out_map_memory.get(),
                          map_size * sizeof(index_type),
                          cudaMemcpyDeviceToHost));

    for (index_type i = 0; i < map_size; ++i) {
      std::cout << p_kernel_map[i + 0 * map_size] << ":"
                << p_kernel_map[i + 1 * map_size] << "->"
                << p_kernel_map[i + 2 * map_size] << "\n";
    }
    std::free(p_kernel_map);
#endif

    // Need to find the start and the size of each key for the kernel map
    // generation.
    thrust::counting_iterator<index_type> min_begin{0};
    thrust::constant_iterator<index_type> size_begin{1};

    gpu_storage<index_type, byte_allocator_type> out_key(m_capacity);
    gpu_storage<index_type, byte_allocator_type> out_key_min(m_capacity);
    gpu_storage<index_type, byte_allocator_type> out_key_size(m_capacity);

    size_type num_unique_keys;
    try {
      auto end = thrust::reduce_by_key(
          thrust::device,  // policy
          kernels.begin(), // key begin
          kernels.end(),   // key end
          thrust::make_zip_iterator(
              thrust::make_tuple(min_begin, size_begin)), // value begin
          out_key.begin(),                                // key out begin
          thrust::make_zip_iterator(thrust::make_tuple(
              out_key_min.begin(), out_key_size.begin())), // value out begin
          thrust::equal_to<index_type>(),        // key equal binary predicate
          detail::min_size_functor<index_type>() // value binary operator
      );
      num_unique_keys = end.first - out_key.begin();
      LOG_DEBUG(num_unique_keys, "unique kernel map keys found");
    }
    THRUST_CATCH;

    auto const cpu_out_keys = out_key.to_vector(num_unique_keys);
    auto const cpu_out_offset = out_key_min.to_vector(num_unique_keys);
    auto const cpu_out_size = out_key_size.to_vector(num_unique_keys);
    // thrust::host_vector<index_type> cpu_out_keys(
    //     out_key.begin(), out_key.begin() + num_unique_keys);
    // thrust::host_vector<index_type> cpu_out_offset(
    //     out_key_min.begin(), out_key_min.begin() + num_unique_keys);
    // thrust::host_vector<index_type> cpu_out_size(
    //     out_key_size.begin(), out_key_size.begin() + num_unique_keys);

#ifdef DEBUG
    LOG_DEBUG("Printing cpu keys");
    LOG_DEBUG("Keys:", cpu_out_keys);
    LOG_DEBUG("Mins:", cpu_out_offset);
    LOG_DEBUG("Size:", cpu_out_size);
#endif

    // create the ordered offset/size maps, one entry per unique kernel index
    for (index_type i = 0; i < num_unique_keys; ++i) {
      m_kernel_offset_map[cpu_out_keys[i]] = cpu_out_offset[i];
      m_kernel_size_map[cpu_out_keys[i]] = cpu_out_size[i];
    }

    // Initialize the decomposed begins and sizes
    m_decomposed = true;
  }

  friend std::ostream &operator<<(std::ostream &out,
                                  self_type const &kernel_map) {
    out << kernel_map.to_string();
    return out;
  }

private:
  bool m_decomposed{false};
  bool m_requires_kernel_index;
  size_type m_memory_size_byte, m_capacity;

  std::shared_ptr<index_type[]> m_kernel_index_memory;
  std::shared_ptr<index_type[]> m_in_map_memory;
  std::shared_ptr<index_type[]> m_out_map_memory;
  byte_allocator_type m_allocator;

  std::map<index_type, index_type> m_kernel_size_map;
  std::map<index_type, index_type> m_kernel_offset_map;

public:
  contiguous_memory kernels;
  contiguous_memory in_maps;
  contiguous_memory out_maps;
}; // gpu_kernel_map

} // namespace minkowski

#endif // KERNEL_MAP_CUH
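// ---------------------------------------------------------------------------
// Hedged end-to-end sketch (illustrative only; `allocator_type`, the fill
// step, and `launch_per_kernel_gemm` are assumptions, not part of this
// header):
//
//   using kmap_t = minkowski::gpu_kernel_map<uint32_t, allocator_type>;
//
//   kmap_t kmap(n_pairs, allocator_type(), /*requires_kernel_index=*/true);
//   // Fill kmap.kernels / kmap.in_maps / kmap.out_maps on device, e.g. from
//   // a coordinate-map query kernel, writing one (kernel, in, out) triplet
//   // per valid input-output pair.
//   kmap.decompose(); // sort by kernel index, build offset/size maps
//
//   for (auto it = kmap.key_cbegin(); it != kmap.key_cend(); ++it)
//     launch_per_kernel_gemm(it->first, kmap); // hypothetical helper
// ---------------------------------------------------------------------------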