@Article{Dalton:2015:OSM, author = "Steven Dalton and Luke Olson and Nathan Bell", title = "Optimizing Sparse Matrix-Matrix Multiplication for the {GPU}", journal = "{ACM} Transactions on Mathematical Software", volume = "41", number = "4", accepted = "1 November 2014", upcoming = "true", abstract = " Sparse matrix-matrix multiplication (SpGEMM) is a key operation in numerous areas from information to the physical sciences. Implementing SpGEMM efficiently on throughput-oriented processors, such as the graphics processing unit (GPU), requires the programmer to expose substantial fine-grained parallelism while conserving the limited off-chip memory bandwidth. Balancing these concerns, we decompose the SpGEMM operation into three, highly-parallel phases: expansion, sorting, and contraction, and introduce a set of complementary bandwidth-saving performance optimizations. Our implementation is fully general and our optimization strategy adaptively processes the SpGEMM workload row-wise to substantially improve the performance by decreasing the work complexity and utilizing the memory hierarchy more effectively.", }