python编写矩阵乘法函数,C语言实现矩阵乘法

　　Python原代码：用np.random.randint从1到100随机生成两个100*100的数组，做矩阵乘法。

import numpy as np
import time
from numba import jit

arr_a = np.random.randint(1, 100, 10000).reshape((100, 100))
arr_b = np.random.randint(1, 100, 10000).reshape((100, 100))

def multiply(arr_a, arr_b):
    res = np.zeros((arr_a.shape[0], arr_b.shape[1]))
    for i in range(arr_a.shape[0]):
        for j in range(arr_b.shape[1]):
            for k in range(arr_b.shape[0]):
                res[i, j] += arr_a[i, k] * arr_b[k, j]
    return res

start = time.time()
print("Result:\n", multiply(arr_a, arr_b))
print("耗时%s" % (time.time() - start))

　　结果：
[[243963 . 16363636366.263299.27155558586 [236866.255407.244862.228297.227939.244239.] [236049.245324.260419.249052.252747.254496.].[243759.258651.232168.225277.235829.246914.] [254883.277709.260438.254744.260553.265045.] [267688.286480.282399 .261162.267804.277926.]]
耗时1.014286638641357

　　1. 使用Numba

　　Numba官网：Numba: 高性能Python编译器

　　Numba是Python的即时（just-in-time，JIT）编译器，最适合用于大量使用NumPy数组、NumPy函数以及循环的代码。使用Numba最常见的方式是通过它提供的一组装饰器（decorator，即以@开头的注解）：把装饰器应用到函数上，就可以指示Numba对该函数进行编译。被Numba装饰的函数在被调用时会被即时编译成机器码执行，之后代码就可以以原生机器码的速度运行了！

　　如果你的代码面向数字科学运算(做大量的数学运算)，大量使用NumPy和/或有很多循环，那么Numba通常是一个不错的选择。

　　加速方法：在方法上面加@jit注解

　　@jit的作用：Numba提供了几个用于代码生成的实用工具，其核心特性是numba.jit()装饰器。使用这个装饰器，你就可以标记一个函数，让Numba的JIT编译器对它进行优化。jit()有几个参数用来控制不同的行为。它有两种不同的编译模式，即nopython模式和object模式。nopython编译模式的行为基本上就是把被装饰的函数编译成完全不需要Python解释器参与就可以运行的代码。这是推荐的、也是最常用的方式。

　　使用nopython模式：在jit的参数中设置nopython=True以启用该模式。

import numpy as np
import time
from numba import jit

arr_a = np.random.randint(1, 100, 10000).reshape((100, 100))
arr_b = np.random.randint(1, 100, 10000).reshape((100, 100))

@jit(nopython=True)
def multiply(arr_a, arr_b):
    res = np.zeros((arr_a.shape[0], arr_b.shape[1]))
    for i in range(arr_a.shape[0]):
        for j in range(arr_b.shape[1]):
            for k in range(arr_b.shape[0]):
                res[i, j] += arr_a[i, k] * arr_b[k, j]
    return res

start = time.time()
print("Result:\n", multiply(arr_a, arr_b))
print("耗时%s" % (time.time() - start))

　　结果：
[[212568 . 26386386386.............[230020.224216.253386.219960.235063.259665.] [211376.216862.239213.213518.222902.231084.].[221239.250413.260120.245681.238919.257253.] [224442.209056.244029.234404.227210.264708.] [220948.223777.253604 .229385.238134.245019.]]
耗时0.426864862420166

　　2. 使用PyCUDA

　　PyCUDA文档：设备接口（Device Interface）- PyCUDA 2020.1文档

　　演示参考：PyCUDA矩阵乘法的精度代码

　　使用PyCUDA需要自己编写C/C++的kernel（内核）函数，然后通过get_function获取并调用SourceModule中的kernel函数。代码如下：

import pycuda.driver as cuda
import pycuda.tools
import pycuda.autoinit
import numpy as np
import numpy.linalg as la
from pycuda.compiler import SourceModule
import time

mod = SourceModule("""
__global__ void MatrixMultiply(float *A, float *B, float *C, int A_shape_0, int A_shape_1, int B_shape_1)
{
    float cValue = 0;
    int Row = blockIdx.y * blockDim.y + threadIdx.y;
    int Col = blockIdx.x * blockDim.x + threadIdx.x;
    if ((Row < A_shape_0) && (Col < B_shape_1)) {
        for (int k = 0; k < A_shape_1; k++) {
            cValue += A[Row * A_shape_1 + k] * B[k * B_shape_1 + Col];
        }
        C[Row * B_shape_1 + Col] = cValue;
    }
}
""")

matrixmultiply = mod.get_function("MatrixMultiply")

n = 100
A = np.random.randint(0, 100, 10000).reshape(100, 100).astype(np.float32)
B = np.random.randint(0, 100, 10000).reshape(100, 100).astype(np.float32)
C = np.zeros((100, 100)).astype(np.float32)
BLOCK_SIZE = 10

# 在设备上申请存储空间
A_gpu = cuda.mem_alloc(A.nbytes)
B_gpu = cuda.mem_alloc(B.nbytes)
C_gpu = cuda.mem_alloc(C.nbytes)

# 将数组从host拷贝到显卡
cuda.memcpy_htod(A_gpu, A)
cuda.memcpy_htod(B_gpu, B)

# 设定grid大小
if n % BLOCK_SIZE != 0:
    grid = (n // BLOCK_SIZE + 1, n // BLOCK_SIZE + 1, 1)
else:
    grid = (n // BLOCK_SIZE, n // BLOCK_SIZE, 1)

# 调用GPU函数
start = time.time()
matrixmultiply(A_gpu, B_gpu, C_gpu,
               np.int32(A.shape[0]), np.int32(A.shape[1]), np.int32(B.shape[1]),
               block=(BLOCK_SIZE, BLOCK_SIZE, 1), grid=grid)

# 复制回result
cuda.memcpy_dtoh(C, C_gpu)
print("Result:\n", C)
print("耗时%s" % (time.time() - start))

　　结果：
[[219468.214786.230702.245646.236251.250875.] [227736.221473.224950.247127.247688.246141.] [223986.193710.221462.231594.245623.234833.].[249705.238607.253167.253975.284177.246474.] [207058.212837.217770.219180.261689.224773.] [213341.231024.251518.229844.268992.245802.]]
耗时0.002018451690673828

　　3. 使用pybind11调用C++写的CUDA代码

　　pybind11是一个轻量级的C++库，用于将你的C++代码暴露给Python调用（反之也可，但主要还是前者）。pybind11借鉴了Boost::Python库的设计，但使用了更为简洁的实现方式，使用了大量C++11的新特性，更易于使用。

　　pybind11安装参考：pybind11使用 - 简书

　　Demo参考：使用Python调用pybind11封装的CUDA C++动态链接库 - 简书

　　C++代码：

　　编写CUDA核函数，例子中用到了NumPy数组在C++中的表示（py::array_t）。导出接口主要用到PYBIND11_MODULE(example, m)这个宏：第一个参数为导出的模块（包）名，第二个参数为模块实例对象，pybind11的模块实例对象提供了def()函数，用来注册要导出的函数。

#include <pybind11/pybind11.h>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <pybind11/numpy.h>

namespace py = pybind11;

__global__ void matrix_glbal_mul(float* arr_a, float* arr_b, float* res, int a_shape_1)
{
    // a_shape_0，a_shape_1分别为第一个数组的行数和列数，b_shape_1为第二个数组的列数
    int x = threadIdx.x + blockIdx.x * blockDim.x; // 定位到res的列索引
    int y = threadIdx.y + blockIdx.y * blockDim.y; // 定位到res的行索引
    float Pvalue = 0;
    for (int k = 0; k < a_shape_1; ++k)
        Pvalue += arr_a[y * a_shape_1 + k] * arr_b[k * a_shape_1 + x];
    res[y * a_shape_1 + x] = Pvalue;
}

py::array_t<float> np_multiply(py::array_t<float> arr_a, py::array_t<float> arr_b)
{
    // 可通过此函数传入Python中的numpy.ndarray数据，在C++中表现为py::array_t<T>格式。
    py::buffer_info bufA = arr_a.request(), bufB = arr_b.request();
    // request方法获得对py::array_t<T>的绑定，包括维度、数据指针、size、shape等参数
    const int a_shape_0 = bufA.shape[0], a_shape_1 = bufA.shape[1], b_shape_1 = bufB.shape[1];
    // 分别是A的行数、列数、B的列数
    std::cout << a_shape_0 << a_shape_1 << b_shape_1 << std::endl;

    auto result = py::array_t<float>(a_shape_0 * b_shape_1);
    result.resize({ a_shape_0, b_shape_1 });
    py::buffer_info bufResult = result.request();
    float *ptrA = (float *)bufA.ptr,
          *ptrB = (float *)bufB.ptr,
          *ptrResult = (float *)bufResult.ptr; // 获得数据指针

    float *d_a, *d_b, *d_res;
    cudaMalloc((void **)&d_a, a_shape_0 * a_shape_1 * sizeof(float));
    cudaMalloc((void **)&d_b, a_shape_1 * b_shape_1 * sizeof(float));
    cudaMalloc((void **)&d_res, a_shape_0 * b_shape_1 * sizeof(float));
    cudaMemcpy(d_a, ptrA, a_shape_0 * a_shape_1 * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, ptrB, a_shape_1 * b_shape_1 * sizeof(float), cudaMemcpyHostToDevice);

    //constexpr const int TP = 10;
    //dim3 block(TP, TP);
    //dim3 grid(a_shape_0 / TP, b_shape_1 / TP);
    constexpr const int TP = 16;
    dim3 block(TP, TP);
    dim3 grid((a_shape_0 + TP - 1) / TP, (b_shape_1 + TP - 1) / TP);

    matrix_glbal_mul<<<grid, block>>>(d_a, d_b, d_res, a_shape_1);
    cudaMemcpy(ptrResult, d_res, a_shape_0 * b_shape_1 * sizeof(float), cudaMemcpyDeviceToHost);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_res);

    return result;
}

PYBIND11_MODULE(example, m)
{
    m.doc() = "pybind11 example module";
    m.def("matrix_glbal_mul", &matrix_glbal_mul, "Multiply two arrays");
    m.def("np_multiply", &np_multiply, "Multiply two arrays");
}

　　Python调用代码：

import numpy as np
import demo.example as example
import time

arr_a = np.random.randint(1, 100, 10000).reshape((100, 100))
arr_b = np.random.randint(1, 100, 10000).reshape((100, 100))

start = time.time()
res = example.np_multiply(arr_a, arr_b)
print("Result:\n", res)
print("耗时%s" % (time.time() - start))

　　结果：
[[279828.259870.266260.254709.227848.250391.] [237871.228993.244860.235741.207431.227064.] [268107.233281.259488.252508.220149.248723.].[276107.237983.269437.253083.233473.255776.] [251326.214743.248869.231401.200128.224235.] [300701.283541.292042.289940.255317.274050.]]
耗时0.14971685409545898

郑重声明：本文由网友发布，不代表盛行IT的观点，版权归原作者所有，仅为传播更多信息之目的，如有侵权请联系，我们将第一时间修改或删除，多谢。

相关文章阅读