Actual source code: vecpthread.c
petsc-3.3-p7 2013-05-11
1: /*
2: Implements the sequential pthread-based vectors.
3: */
4: #include <petscconf.h>
5: #include <../src/vec/vec/impls/dvecimpl.h> /*I "petscvec.h" I*/
6: #include <../src/sys/objects/pthread/pthreadimpl.h>
7: #include <../src/vec/vec/impls/seq/seqpthread/vecpthreadimpl.h>
8: #include <petscblaslapack.h>
9: #include <petsc-private/petscaxpy.h>
11: PetscInt vecs_created=0;
12: Vec_KernelData *vec_kerneldatap;
13: Vec_KernelData **vec_pdata;
17: /*@
18: VecGetThreadOwnershipRange - Returns the range of indices owned by
19: this thread, assuming that the vector is laid out with the first
20: thread operating on the first n1 elements, the second thread on the
21: next n2 elements, and so on.
23: Not thread collective
24: Input Parameters:
25: + X - the vector
26: - thread_id - Thread number
28: Output Parameters:
29: + start - the first thread local element index, pass in PETSC_NULL if not interested
30: - end - one more than the last thread local element index, pass in PETSC_NULL if not interested
32: Level: beginner
34: Concepts: vector^ownership of elements on threads
35: @*/
36: PetscErrorCode VecGetThreadOwnershipRange(Vec X,PetscInt thread_id,PetscInt *start,PetscInt *end)
37: {
38: PetscThreadsLayout tmap=X->map->tmap;
39: PetscInt *trstarts=tmap->trstarts;
42: if(start) *start = trstarts[thread_id];
43: if(end) *end = trstarts[thread_id+1];
44: return(0);
45: }
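/*
   Added commentary (illustrative sketch, not part of the original source):
   every per-thread kernel in this file first queries the slice it owns with
   VecGetThreadOwnershipRange() and then touches only entries in [start,end).
   The kernel below is hypothetical and only shows the pattern; it reuses the
   Vec_KernelData fields (X, thread_id, x) employed throughout this file.

     PetscErrorCode VecSqr_Kernel(void *arg)
     {
       Vec_KernelData *data = (Vec_KernelData*)arg;
       PetscScalar    *x = data->x;
       PetscInt       i,start,end;

       VecGetThreadOwnershipRange(data->X,data->thread_id,&start,&end);
       for (i=start; i<end; i++) x[i] *= x[i];
       return(0);
     }
*/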
47: PetscErrorCode VecDot_Kernel(void *arg)
48: {
49: PetscErrorCode ierr;
50: Vec_KernelData *data = (Vec_KernelData*)arg;
51: Vec X=data->X;
52: PetscInt thread_id=data->thread_id;
53: const PetscScalar *x, *y;
54: PetscInt n,start,end;
55: PetscBLASInt one = 1, bn,bstart;
57: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
58: x = (const PetscScalar*)data->x;
59: y = (const PetscScalar*)data->y;
60: n = end-start;
61: bn = PetscBLASIntCast(n);
62: bstart = PetscBLASIntCast(start);
63: /* arguments y, x are reversed because BLAS complex-conjugates the first argument, PETSc the second */
64: data->result = BLASdot_(&bn,y+bstart,&one,x+bstart,&one);
65: return(0);
66: }
70: PetscErrorCode VecDot_SeqPThread(Vec xin,Vec yin,PetscScalar *z)
71: {
72: PetscErrorCode ierr;
73: PetscThreadsLayout tmap=xin->map->tmap;
74: PetscScalar *ya,*xa;
75: PetscInt i;
79: VecGetArray(xin,&xa);
80: VecGetArray(yin,&ya);
82: for (i=0; i<tmap->nthreads; i++) {
83: vec_kerneldatap[i].X = xin;
84: vec_kerneldatap[i].thread_id = i;
85: vec_kerneldatap[i].x = xa;
86: vec_kerneldatap[i].y = ya;
87: vec_pdata[i] = &vec_kerneldatap[i];
88: }
90: PetscThreadsRunKernel(VecDot_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
92: /* gather result */
93: *z = vec_kerneldatap[0].result;
94: for(i=1; i<tmap->nthreads; i++) {
95: *z += vec_kerneldatap[i].result;
96: }
98: VecRestoreArray(xin,&xa);
99: VecRestoreArray(yin,&ya);
101: if (xin->map->n > 0) {
102: PetscLogFlops(2.0*xin->map->n-1+tmap->nthreads-1);
103: }
104: return(0);
105: }
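/*
   Added commentary (not in the original source): VecDot_SeqPThread() above
   shows the dispatch pattern every VecXXX_SeqPThread front end in this file
   follows - fill one Vec_KernelData slot per thread, hand the pointer array
   to PetscThreadsRunKernel(), and then, for reductions such as dot products
   and norms, combine the per-thread partial results serially on the calling
   thread.
*/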
107: PetscErrorCode VecTDot_Kernel(void *arg)
108: {
110: Vec_KernelData *data = (Vec_KernelData*)arg;
111: Vec X = data->X;
112: PetscInt thread_id=data->thread_id;
113: const PetscScalar *x, *y;
114: PetscInt n,start,end;
115: PetscBLASInt one = 1, bn,bstart;
117: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
118: x = (const PetscScalar*)data->x;
119: y = (const PetscScalar*)data->y;
120: n = end-start;
121: bn = PetscBLASIntCast(n);
122: bstart = PetscBLASIntCast(start);
123: data->result = BLASdotu_(&bn,x+bstart,&one,y+bstart,&one);
124: return(0);
125: }
129: PetscErrorCode VecTDot_SeqPThread(Vec xin,Vec yin,PetscScalar *z)
130: {
131: PetscErrorCode ierr;
132: PetscThreadsLayout tmap = xin->map->tmap;
133: PetscScalar *ya,*xa;
134: PetscInt i;
138: VecGetArray(xin,&xa);
139: VecGetArray(yin,&ya);
141: for (i=0; i<tmap->nthreads; i++) {
142: vec_kerneldatap[i].X = xin;
143: vec_kerneldatap[i].thread_id = i;
144: vec_kerneldatap[i].x = xa;
145: vec_kerneldatap[i].y = ya;
146: vec_pdata[i] = &vec_kerneldatap[i];
147: }
149: PetscThreadsRunKernel(VecTDot_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
151: /* gather result */
152: *z = vec_kerneldatap[0].result;
153: for(i=1; i<tmap->nthreads; i++) {
154: *z += vec_kerneldatap[i].result;
155: }
157: VecRestoreArray(xin,&xa);
158: VecRestoreArray(yin,&ya);
160: if (xin->map->n > 0) {
161: PetscLogFlops(2.0*xin->map->n-1+tmap->nthreads-1);
162: }
163: return(0);
164: }
166: PetscErrorCode VecScale_Kernel(void *arg)
167: {
169: Vec_KernelData *data = (Vec_KernelData*)arg;
170: Vec X=data->X;
171: PetscInt thread_id=data->thread_id;
172: PetscScalar a,*x;
173: PetscBLASInt one = 1, bn,bstart;
174: PetscInt n,start,end;
176: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
177: x = data->x;
178: a = data->alpha;
179: n = end-start;
180: bn = PetscBLASIntCast(n);
181: bstart = PetscBLASIntCast(start);
182: BLASscal_(&bn,&a,x+bstart,&one);
183: return(0);
184: }
186: PetscErrorCode VecSet_SeqPThread(Vec,PetscScalar);
190: PetscErrorCode VecScale_SeqPThread(Vec xin, PetscScalar alpha)
191: {
192: PetscErrorCode ierr;
193: PetscThreadsLayout tmap=xin->map->tmap;
197: if (alpha == 0.0) {
198: VecSet_SeqPThread(xin,alpha);
199: } else if (alpha != 1.0) {
200: PetscScalar *xa;
201: PetscInt i;
203: VecGetArray(xin,&xa);
204: for (i=0; i< tmap->nthreads; i++) {
205: vec_kerneldatap[i].X = xin;
206: vec_kerneldatap[i].thread_id = i;
207: vec_kerneldatap[i].x = xa;
208: vec_kerneldatap[i].alpha = alpha;
209: vec_pdata[i] = &vec_kerneldatap[i];
210: }
211: PetscThreadsRunKernel(VecScale_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
213: VecRestoreArray(xin,&xa);
214: }
215: PetscLogFlops(xin->map->n);
216: return(0);
217: }
219: PetscErrorCode VecAXPY_Kernel(void *arg)
220: {
221: PetscErrorCode ierr;
222: Vec_KernelData *data = (Vec_KernelData*)arg;
223: Vec X=data->X;
224: PetscInt thread_id=data->thread_id;
225: PetscScalar a,*y;
226: const PetscScalar *x;
227: PetscBLASInt one = 1, bn,bstart;
228: PetscInt n,start,end;
230: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
231: x = (const PetscScalar*)data->x;
232: y = data->y;
233: a = data->alpha;
234: n = end-start;
235: bn = PetscBLASIntCast(n);
236: bstart = PetscBLASIntCast(start);
237: BLASaxpy_(&bn,&a,x+bstart,&one,y+bstart,&one);
238: return(0);
239: }
243: PetscErrorCode VecAXPY_SeqPThread(Vec yin,PetscScalar alpha,Vec xin)
244: {
245: PetscErrorCode ierr;
246: PetscThreadsLayout tmap=yin->map->tmap;
247: PetscScalar *ya,*xa;
248: PetscInt i;
252: if (alpha != 0.0) {
253: VecGetArray(xin,&xa);
254: VecGetArray(yin,&ya);
255: for (i=0; i<tmap->nthreads; i++) {
256: vec_kerneldatap[i].X = yin;
257: vec_kerneldatap[i].thread_id = i;
258: vec_kerneldatap[i].x = xa;
259: vec_kerneldatap[i].y = ya;
260: vec_kerneldatap[i].alpha = alpha;
261: vec_pdata[i] = &vec_kerneldatap[i];
262: }
263: PetscThreadsRunKernel(VecAXPY_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
264: VecRestoreArray(xin,&xa);
265: VecRestoreArray(yin,&ya);
266: PetscLogFlops(2.0*yin->map->n);
267: }
268: return(0);
269: }
271: PetscErrorCode VecAYPX_Kernel(void *arg)
272: {
273: PetscErrorCode ierr;
274: Vec_KernelData *data = (Vec_KernelData*)arg;
275: Vec X=data->X;
276: PetscInt thread_id=data->thread_id;
277: PetscScalar a,*y;
278: const PetscScalar *x;
279: #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
280: PetscInt n;
281: #endif
282: PetscInt i,start,end;
284: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
285: x = (const PetscScalar*)data->x;
286: y = data->y;
287: a = data->alpha;
289: #if defined(PETSC_USE_FORTRAN_KERNEL_AYPX)
290: n = end-start;
291: fortranaypx_(&n,&a,x+start,y+start);
292: #else
293: if(a==-1.0) {
294: for (i=start; i<end; i++) {
295: y[i] = x[i] - y[i];
296: }
297: }
298: else {
299: for (i=start; i<end; i++) {
300: y[i] = x[i] + a*y[i];
301: }
302: }
303: #endif
304: return(0);
305: }
309: PetscErrorCode VecAYPX_SeqPThread(Vec yin,PetscScalar alpha,Vec xin)
310: {
311: PetscErrorCode ierr;
312: PetscThreadsLayout tmap=yin->map->tmap;
313: PetscScalar *ya,*xa;
314: PetscInt i;
318: if (alpha == 0.0) {
319: VecCopy(xin,yin);
320: }
321: else if (alpha == 1.0) {
322: VecAXPY_SeqPThread(yin,alpha,xin);
323: }
324: else {
325: VecGetArray(xin,&xa);
326: VecGetArray(yin,&ya);
327: for (i=0; i<tmap->nthreads; i++) {
328: vec_kerneldatap[i].X = yin;
329: vec_kerneldatap[i].thread_id = i;
330: vec_kerneldatap[i].x = xa;
331: vec_kerneldatap[i].y = ya;
332: vec_kerneldatap[i].alpha = alpha;
333: vec_pdata[i] = &vec_kerneldatap[i];
334: }
335: PetscThreadsRunKernel(VecAYPX_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
336: VecRestoreArray(xin,&xa);
337: VecRestoreArray(yin,&ya);
338: if(alpha==-1.0) {
339: PetscLogFlops(1.0*xin->map->n);
340: }
341: else {
342: PetscLogFlops(2.0*xin->map->n);
343: }
344: }
345: return(0);
346: }
348: PetscErrorCode VecAX_Kernel(void *arg)
349: {
350: PetscErrorCode ierr;
351: Vec_KernelData *data = (Vec_KernelData*)arg;
352: Vec X=data->X;
353: PetscInt thread_id=data->thread_id;
354: PetscScalar a,*y;
355: const PetscScalar *x;
356: PetscInt i,start,end;
358: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
359: x = (const PetscScalar*)data->x;
360: y = data->y;
361: a = data->alpha;
362: for(i=start;i < end; i++) y[i] = a*x[i];
363: return(0);
364: }
366: PetscErrorCode VecAXPBY_Kernel(void *arg)
367: {
368: PetscErrorCode ierr;
369: Vec_KernelData *data = (Vec_KernelData*)arg;
370: Vec X=data->X;
371: PetscInt thread_id=data->thread_id;
372: PetscScalar a,b,*y;
373: const PetscScalar *x;
374: PetscInt i,start,end;
376: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
377: x = (const PetscScalar*)data->x;
378: y = data->y;
379: a = data->alpha;
380: b = data->beta;
381: for(i=start;i < end; i++) y[i] = a*x[i] + b*y[i];
382: return(0);
383: }
387: PetscErrorCode VecAXPBY_SeqPThread(Vec yin,PetscScalar alpha,PetscScalar beta,Vec xin)
388: {
389: PetscErrorCode ierr;
390: PetscThreadsLayout tmap=yin->map->tmap;
391: PetscScalar *ya,*xa;
392: PetscInt i=0;
396: if(alpha == 0.0 && beta == 1.0) {
397: return(0);
398: }
400: if(alpha == (PetscScalar)0.0) {
401: VecScale_SeqPThread(yin,beta);
402: } else if (beta == (PetscScalar)1.0) {
403: VecAXPY_SeqPThread(yin,alpha,xin);
404: } else if (alpha == (PetscScalar)1.0) {
405: VecAYPX_SeqPThread(yin,beta,xin);
406: } else if (beta == (PetscScalar)0.0) {
407: VecGetArray(xin,&xa);
408: VecGetArray(yin,&ya);
409: for (i=0; i<tmap->nthreads; i++) {
410: vec_kerneldatap[i].X = yin;
411: vec_kerneldatap[i].thread_id=i;
412: vec_kerneldatap[i].x = xa;
413: vec_kerneldatap[i].y = ya;
414: vec_kerneldatap[i].alpha = alpha;
415: vec_pdata[i] = &vec_kerneldatap[i];
416: }
417:
418: PetscThreadsRunKernel(VecAX_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
419: PetscLogFlops(xin->map->n);
420:
421: VecRestoreArray(xin,&xa);
422: VecRestoreArray(yin,&ya);
423:
424: } else {
425: VecGetArray(xin,&xa);
426: VecGetArray(yin,&ya);
427: for (i=0; i<tmap->nthreads; i++) {
428: vec_kerneldatap[i].X = yin;
429: vec_kerneldatap[i].thread_id = i;
430: vec_kerneldatap[i].x = xa;
431: vec_kerneldatap[i].y = ya;
432: vec_kerneldatap[i].alpha = alpha;
433: vec_kerneldatap[i].beta = beta;
434: vec_pdata[i] = &vec_kerneldatap[i];
435: }
436:
437: PetscThreadsRunKernel(VecAXPBY_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
438: PetscLogFlops(3.0*xin->map->n);
439:
440: VecRestoreArray(xin,&xa);
441: VecRestoreArray(yin,&ya);
442:
443: }
444: return(0);
445: }
447: PetscErrorCode VecWAXPY_Kernel(void *arg)
448: {
449: Vec_KernelData *data = (Vec_KernelData*)arg;
450: Vec X=data->X;
451: PetscInt thread_id=data->thread_id;
452: PetscScalar a,*ww;
453: const PetscScalar *xx,*yy;
454: PetscInt n;
455: PetscInt i,start,end;
456: PetscErrorCode ierr;
458: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
459: ww = data->w;
460: xx = (const PetscScalar*)data->x;
461: yy = (const PetscScalar*)data->y;
462: a = data->alpha;
463: n = end-start;
464: #if defined(PETSC_USE_FORTRAN_KERNEL_WAXPY)
465: fortranwaxpy_(&n,&a,xx+start,yy+start,ww+start);
466: #else
467: if (a == 0.0) {
468: PetscMemcpy(ww+start,yy+start,n*sizeof(PetscScalar));
469: }
470: else if(a==-1.0) {
471: for (i=start; i<end; i++) {
472: ww[i] = yy[i] - xx[i];
473: }
474: }
475: else if(a==1.0) {
476: for (i=start; i<end; i++) {
477: ww[i] = yy[i] + xx[i];
478: }
479: }
480: else {
481: for (i=start; i<end; i++) {
482: ww[i] = a*xx[i] + yy[i];
483: }
484: }
485: #endif
486: return(0);
487: }
491: PetscErrorCode VecWAXPY_SeqPThread(Vec win, PetscScalar alpha,Vec xin,Vec yin)
492: {
493: PetscErrorCode ierr;
494: PetscThreadsLayout tmap=win->map->tmap;
495: PetscScalar *ya,*xa,*wa;
496: PetscInt i;
500: VecGetArray(xin,&xa);
501: VecGetArray(yin,&ya);
502: VecGetArray(win,&wa);
504: for (i=0; i<tmap->nthreads; i++) {
505: vec_kerneldatap[i].X = win;
506: vec_kerneldatap[i].thread_id = i;
507: vec_kerneldatap[i].x = xa;
508: vec_kerneldatap[i].y = ya;
509: vec_kerneldatap[i].w = wa;
510: vec_kerneldatap[i].alpha = alpha;
511: vec_pdata[i] = &vec_kerneldatap[i];
512: }
513: PetscThreadsRunKernel(VecWAXPY_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
515: if (alpha == 1.0 || alpha == -1.0) {
516: PetscLogFlops(1.0*win->map->n);
517: }
518: else {
519: PetscLogFlops(2.0*win->map->n);
520: }
522: VecRestoreArray(xin,&xa);
523: VecRestoreArray(yin,&ya);
524: VecRestoreArray(win,&wa);
525: return(0);
526: }
528: PetscErrorCode VecNorm_Kernel(void *arg)
529: {
530: Vec_KernelData *data = (Vec_KernelData*)arg;
532: Vec X=data->X;
533: PetscInt thread_id=data->thread_id;
534: const PetscScalar *x;
535: NormType type;
536: PetscInt i,n,start,end;
538: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
539: x = (const PetscScalar*)data->x;
540: type = data->typeUse;
541: n = end-start;
542: data->result = 0.0;
543: if(type==NORM_1) {
544: PetscBLASInt one = 1, bn = PetscBLASIntCast(n),bstart=PetscBLASIntCast(start);
545: data->result = BLASasum_(&bn,x+bstart,&one);
546: }
547: else if(type==NORM_INFINITY) {
548: PetscReal maxv = 0.0,tmp;
549: for(i=start; i<end; i++) {
550: tmp = PetscAbsScalar(x[i]);
551: if(tmp>maxv) {
552: maxv = tmp;
553: }
554: }
555: data->result = maxv;
556: } else {
557: PetscBLASInt one = 1, bn = PetscBLASIntCast(n),bstart=PetscBLASIntCast(start);
558: data->result = BLASdot_(&bn,x+bstart,&one,x+bstart,&one);
559: }
560: return(0);
561: }
565: PetscErrorCode VecNorm_SeqPThread(Vec xin,NormType type,PetscReal* z)
566: {
568: PetscErrorCode ierr;
569: PetscThreadsLayout tmap=xin->map->tmap;
570: PetscScalar *xa;
574: if(type == NORM_1_AND_2) {
575: VecNorm_SeqPThread(xin,NORM_1,z);
576: VecNorm_SeqPThread(xin,NORM_2,z+1);
577: }
578: else {
579: PetscInt i;
581: VecGetArray(xin,&xa);
583: for (i=0; i<tmap->nthreads; i++) {
584: vec_kerneldatap[i].X = xin;
585: vec_kerneldatap[i].thread_id = i;
586: vec_kerneldatap[i].x = xa;
587: vec_kerneldatap[i].typeUse = type;
588: vec_pdata[i] = &vec_kerneldatap[i];
589: }
590: PetscThreadsRunKernel(VecNorm_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
591: /* collect results */
592: *z = (PetscReal)vec_kerneldatap[0].result;
593: if(type == NORM_1) {
594: for(i=1; i<tmap->nthreads; i++) {
595: *z += (PetscReal)vec_kerneldatap[i].result;
596: }
597: PetscLogFlops(PetscMax(xin->map->n-1.0+tmap->nthreads-1,0.0));
598: }
599: else if(type == NORM_2 || type == NORM_FROBENIUS) {
600: *z = (PetscReal)vec_kerneldatap[0].result;
601: for(i=1; i<tmap->nthreads; i++) {
602: *z += (PetscReal)vec_kerneldatap[i].result;
603: }
604: *z = PetscSqrtReal(*z);
605: PetscLogFlops(PetscMax(2.0*xin->map->n-1+tmap->nthreads-1,0.0));
606: }
607: else {
608: PetscReal maxv = 0.0,tmp;
609: for(i=0; i<tmap->nthreads; i++) {
610: tmp = (PetscReal)vec_kerneldatap[i].result;
611: if(tmp>maxv) {
612: maxv = tmp;
613: }
614: }
615: *z = maxv;
616: }
617: VecRestoreArray(xin,&xa);
618: }
619: return(0);
620: }
622: PetscErrorCode VecMDot_Kernel4(void* arg)
623: {
624: Vec_KernelData *data = (Vec_KernelData*)arg;
625: PetscErrorCode ierr;
626: Vec X=data->X;
627: PetscInt thread_id=data->thread_id;
628: PetscInt start,end;
629: const PetscScalar *x = (const PetscScalar*)data->x;
630: const PetscScalar *y0 = (const PetscScalar*)data->y0;
631: const PetscScalar *y1 = (const PetscScalar*)data->y1;
632: const PetscScalar *y2 = (const PetscScalar*)data->y2;
633: const PetscScalar *y3 = (const PetscScalar*)data->y3;
634: PetscInt i;
635: PetscScalar sum0,sum1,sum2,sum3;
637: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
639: sum0 = sum1 = sum2 = sum3 = 0.0;
640: for(i=start;i<end;i++) {
641: sum0 += (x[i])*PetscConj(y0[i]);
642: sum1 += (x[i])*PetscConj(y1[i]);
643: sum2 += (x[i])*PetscConj(y2[i]);
644: sum3 += (x[i])*PetscConj(y3[i]);
645: }
646: data->result0 = sum0; data->result1 = sum1; data->result2 = sum2; data->result3 = sum3;
647: return(0);
648: }
650: PetscErrorCode VecMDot_Kernel3(void* arg)
651: {
652: Vec_KernelData *data = (Vec_KernelData*)arg;
653: PetscErrorCode ierr;
654: Vec X=data->X;
655: PetscInt thread_id=data->thread_id;
656: PetscInt start,end;
657: const PetscScalar *x = (const PetscScalar*)data->x;
658: const PetscScalar *y0 = (const PetscScalar*)data->y0;
659: const PetscScalar *y1 = (const PetscScalar*)data->y1;
660: const PetscScalar *y2 = (const PetscScalar*)data->y2;
661: PetscInt i;
662: PetscScalar sum0,sum1,sum2;
663:
664: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
665: sum0 = sum1 = sum2 = 0.0;
666: for(i=start;i<end;i++) {
667: sum0 += (x[i])*PetscConj(y0[i]);
668: sum1 += (x[i])*PetscConj(y1[i]);
669: sum2 += (x[i])*PetscConj(y2[i]);
670: }
671: data->result0 = sum0; data->result1 = sum1; data->result2 = sum2;
672: return(0);
673: }
675: PetscErrorCode VecMDot_Kernel2(void* arg)
676: {
677: Vec_KernelData *data = (Vec_KernelData*)arg;
678: PetscErrorCode ierr;
679: Vec X=data->X;
680: PetscInt thread_id=data->thread_id;
681: PetscInt start,end;
682: const PetscScalar *x = (const PetscScalar*)data->x;
683: const PetscScalar *y0 = (const PetscScalar*)data->y0;
684: const PetscScalar *y1 = (const PetscScalar*)data->y1;
685: PetscInt i;
686: PetscScalar sum0,sum1;
688: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
689: sum0 = sum1 = 0.0;
690: for(i=start;i<end;i++) {
691: sum0 += (x[i])*PetscConj(y0[i]);
692: sum1 += (x[i])*PetscConj(y1[i]);
693: }
694: data->result0 = sum0; data->result1 = sum1;
695: return(0);
696: }
700: PetscErrorCode VecMDot_SeqPThread(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
701: {
702: PetscErrorCode ierr;
703: PetscThreadsLayout tmap=xin->map->tmap;
704: Vec *yy = (Vec*)yin;
705: PetscScalar *xa,*y0,*y1,*y2,*y3;
706: PetscInt i,j,j_rem;
709: VecGetArray(xin,&xa);
710: switch(j_rem = nv&0x3) {
711: case 3:
712: VecGetArray(yy[0],&y0);
713: VecGetArray(yy[1],&y1);
714: VecGetArray(yy[2],&y2);
716: for(i=0;i<tmap->nthreads;i++) {
717: vec_kerneldatap[i].X = xin;
718: vec_kerneldatap[i].thread_id = i;
719: vec_kerneldatap[i].x = xa;
720: vec_kerneldatap[i].y0 = y0;
721: vec_kerneldatap[i].y1 = y1;
722: vec_kerneldatap[i].y2 = y2;
723: vec_pdata[i] = &vec_kerneldatap[i];
724: }
725: PetscThreadsRunKernel(VecMDot_Kernel3,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
727: VecRestoreArray(yy[0],&y0);
728: VecRestoreArray(yy[1],&y1);
729: VecRestoreArray(yy[2],&y2);
731: z[0] = vec_kerneldatap[0].result0;
732: for(j=1;j<tmap->nthreads;j++) {
733: z[0] += vec_kerneldatap[j].result0;
734: }
735: z[1] = vec_kerneldatap[0].result1;
736: for(j=1;j<tmap->nthreads;j++) {
737: z[1] += vec_kerneldatap[j].result1;
738: }
739: z[2] = vec_kerneldatap[0].result2;
740: for(j=1;j<tmap->nthreads;j++) {
741: z[2] += vec_kerneldatap[j].result2;
742: }
743: yy += 3;
744: z += 3;
745: break;
746: case 2:
747: VecGetArray(yy[0],&y0);
748: VecGetArray(yy[1],&y1);
750: for(i=0;i<tmap->nthreads;i++) {
751: vec_kerneldatap[i].X = xin;
752: vec_kerneldatap[i].thread_id = i;
753: vec_kerneldatap[i].x = xa;
754: vec_kerneldatap[i].y0 = y0;
755: vec_kerneldatap[i].y1 = y1;
756: vec_pdata[i] = &vec_kerneldatap[i];
757: }
758: PetscThreadsRunKernel(VecMDot_Kernel2,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
760: VecRestoreArray(yy[0],&y0);
761: VecRestoreArray(yy[1],&y1);
763: z[0] = vec_kerneldatap[0].result0;
764: for(j=1;j<tmap->nthreads;j++) {
765: z[0] += vec_kerneldatap[j].result0;
766: }
767: z[1] = vec_kerneldatap[0].result1;
768: for(j=1;j<tmap->nthreads;j++) {
769: z[1] += vec_kerneldatap[j].result1;
770: }
771: yy += 2; z += 2;
772: break;
773: case 1:
774: VecGetArray(yy[0],&y0);
776: for(i=0;i<tmap->nthreads;i++) {
777: vec_kerneldatap[i].X = xin;
778: vec_kerneldatap[i].thread_id = i;
779: vec_kerneldatap[i].x = xa;
780: vec_kerneldatap[i].y = y0;
781: vec_pdata[i] = &vec_kerneldatap[i];
782: }
783: PetscThreadsRunKernel(VecDot_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
784:
785: VecRestoreArray(yy[0],&y0);
787: z[0] = vec_kerneldatap[0].result;
788: for(j=1;j<tmap->nthreads;j++) {
789: z[0] += vec_kerneldatap[j].result;
790: }
791: yy++; z++;
792: break;
793: }
794: for(j=j_rem;j<nv;j+=4) {
795: VecGetArray(yy[0],&y0);
796: VecGetArray(yy[1],&y1);
797: VecGetArray(yy[2],&y2);
798: VecGetArray(yy[3],&y3);
800: for(i=0;i<tmap->nthreads;i++) {
801: vec_kerneldatap[i].X = xin;
802: vec_kerneldatap[i].thread_id = i;
803: vec_kerneldatap[i].x = xa;
804: vec_kerneldatap[i].y0 = y0;
805: vec_kerneldatap[i].y1 = y1;
806: vec_kerneldatap[i].y2 = y2;
807: vec_kerneldatap[i].y3 = y3;
808: vec_pdata[i] = &vec_kerneldatap[i];
809: }
810: PetscThreadsRunKernel(VecMDot_Kernel4,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
812: VecRestoreArray(yy[0],&y0);
813: VecRestoreArray(yy[1],&y1);
814: VecRestoreArray(yy[2],&y2);
815: VecRestoreArray(yy[3],&y3);
817: z[0] = vec_kerneldatap[0].result0;
818: for(i=1;i<tmap->nthreads;i++) {
819: z[0] += vec_kerneldatap[i].result0;
820: }
821: z[1] = vec_kerneldatap[0].result1;
822: for(i=1;i<tmap->nthreads;i++) {
823: z[1] += vec_kerneldatap[i].result1;
824: }
825: z[2] = vec_kerneldatap[0].result2;
826: for(i=1;i<tmap->nthreads;i++) {
827: z[2] += vec_kerneldatap[i].result2;
828: }
829: z[3] = vec_kerneldatap[0].result3;
830: for(i=1;i<tmap->nthreads;i++) {
831: z[3] += vec_kerneldatap[i].result3;
832: }
833: yy += 4;
834: z += 4;
835: }
836: VecRestoreArray(xin,&xa);
838: PetscLogFlops(PetscMax(nv*(2.0*xin->map->n-1+tmap->nthreads-1),0.0));
839: return(0);
840: }
844: PetscErrorCode VecMTDot_SeqPThread(Vec xin,PetscInt nv,const Vec yin[],PetscScalar *z)
845: {
846: PetscErrorCode ierr=0;
847: PetscInt j;
851: for(j=0;j<nv;j++) {
852: VecTDot_SeqPThread(xin,yin[j],&z[j]);
853: }
855: return(0);
856: }
859: PetscErrorCode VecMax_Kernel(void *arg)
860: {
861: Vec_KernelData *data = (Vec_KernelData*)arg;
862: PetscErrorCode ierr;
863: Vec X=data->X;
864: PetscInt thread_id=data->thread_id;
865: PetscInt start,end;
866: const PetscScalar *xx = (const PetscScalar*)data->x;
867: PetscInt i,j;
868: PetscReal lmax,tmp;
870: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
871: #if defined(PETSC_USE_COMPLEX)
872: lmax = PetscRealPart(xx[start]); j = 0;
873: #else
874: lmax = xx[start]; j = 0;
875: #endif
876: for (i=start+1; i<end; i++) {
877: #if defined(PETSC_USE_COMPLEX)
878: if ((tmp = PetscRealPart(xx[i])) > lmax) { j = i; lmax = tmp;}
879: #else
880: if ((tmp = xx[i]) > lmax) { j = i; lmax = tmp; }
881: #endif
882: }
884: data->localmax = lmax;
885: data->localind = j;
886: return(0);
887: }
891: PetscErrorCode VecMax_SeqPThread(Vec xin,PetscInt* idx,PetscReal * z)
892: {
893: PetscErrorCode ierr;
894: PetscThreadsLayout tmap=xin->map->tmap;
895: PetscInt i,j=0;
896: PetscScalar *xa;
897: PetscReal max;
901: VecGetArray(xin,&xa);
902: if (!xin->map->n) {
903: max = PETSC_MIN_REAL;
904: j = -1;
905: } else {
906: for (i=0; i<tmap->nthreads; i++) {
907: vec_kerneldatap[i].X = xin;
908: vec_kerneldatap[i].thread_id = i;
909: vec_kerneldatap[i].x = xa;
910: vec_pdata[i] = &vec_kerneldatap[i];
911: }
912: PetscThreadsRunKernel(VecMax_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
913: /* collect results, determine global max, global index */
914: max = vec_kerneldatap[0].localmax;
915: j = vec_kerneldatap[0].localind;
916: for(i=1; i<tmap->nthreads; i++) {
917: if(vec_kerneldatap[i].localmax > max) {
918: max = vec_kerneldatap[i].localmax;
919: j = vec_kerneldatap[i].localind;
920: }
921: }
922: }
923: *z = max;
924: if (idx) *idx = j;
925: VecRestoreArray(xin,&xa);
926: return(0);
927: }
929: PetscErrorCode VecMin_Kernel(void *arg)
930: {
931: Vec_KernelData *data = (Vec_KernelData*)arg;
932: PetscErrorCode ierr;
933: Vec X=data->X;
934: PetscInt thread_id=data->thread_id;
935: PetscInt start,end;
936: const PetscScalar *xx = (const PetscScalar*)data->x;
937: PetscInt i,j;
938: PetscReal lmin,tmp;
940: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
941: #if defined(PETSC_USE_COMPLEX)
942: lmin = PetscRealPart(xx[start]); j = 0;
943: #else
944: lmin = xx[start]; j = 0;
945: #endif
946: for (i=start+1; i<end; i++) {
947: #if defined(PETSC_USE_COMPLEX)
948: if ((tmp = PetscRealPart(xx[i])) < lmin) { j = i; lmin = tmp;}
949: #else
950: if ((tmp = xx[i]) < lmin) { j = i; lmin = tmp; }
951: #endif
952: }
954: data->localmin = lmin;
955: data->localind = j;
956: return(0);
957: }
961: PetscErrorCode VecMin_SeqPThread(Vec xin,PetscInt* idx,PetscReal * z)
962: {
963: PetscErrorCode ierr;
964: PetscThreadsLayout tmap=xin->map->tmap;
965: PetscInt i,j=0;
966: PetscScalar *xa;
967: PetscReal min;
971: VecGetArray(xin,&xa);
972: if (!xin->map->n) {
973: min = PETSC_MAX_REAL;
974: j = -1;
975: } else {
976: for (i=0; i<tmap->nthreads; i++) {
977: vec_kerneldatap[i].X = xin;
978: vec_kerneldatap[i].thread_id = i;
979: vec_kerneldatap[i].x = xa;
980: vec_pdata[i] = &vec_kerneldatap[i];
981: }
983: PetscThreadsRunKernel(VecMin_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
984: /* collect results, determine global min, global index */
985: min = vec_kerneldatap[0].localmin;
986: j = vec_kerneldatap[0].localind;
987: for(i=1; i<tmap->nthreads; i++) {
988: if(vec_kerneldatap[i].localmin < min) {
989: min = vec_kerneldatap[i].localmin;
990: j = vec_kerneldatap[i].localind;
991: }
992: }
993: }
994: *z = min;
995: if (idx) *idx = j;
996: VecRestoreArray(xin,&xa);
997: return(0);
998: }
1000: #include <../src/vec/vec/impls/seq/ftn-kernels/fxtimesy.h>
1001: PetscErrorCode VecPointwiseMult_Kernel(void *arg)
1002: {
1003: Vec_KernelData *data = (Vec_KernelData*)arg;
1004: PetscErrorCode ierr;
1005: Vec X=data->X;
1006: PetscInt thread_id=data->thread_id;
1007: PetscInt start,end;
1008: #if defined(PETSC_USE_FORTRAN_KERNEL_XTIMESY)
1009: PetscInt n;
1010: #endif
1011: PetscScalar *ww = data->w,*xx = data->x,*yy = data->y;
1012: PetscInt i;
1014: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
1015: if (ww == xx) {
1016: for (i=start; i<end; i++) ww[i] *= yy[i];
1017: } else if (ww == yy) {
1018: for (i=start; i<end; i++) ww[i] *= xx[i];
1019: } else {
1020: #if defined(PETSC_USE_FORTRAN_KERNEL_XTIMESY)
1021: n = end-start;
1022: fortranxtimesy_(xx,yy,ww,&n);
1023: #else
1024: for (i=start; i<end; i++) ww[i] = xx[i] * yy[i];
1025: #endif
1026: }
1027: return(0);
1028: }
1032: PetscErrorCode VecPointwiseMult_SeqPThread(Vec win,Vec xin,Vec yin)
1033: {
1034: PetscErrorCode ierr;
1035: PetscThreadsLayout tmap=win->map->tmap;
1036: PetscScalar *ya,*xa,*wa;
1037: PetscInt i;
1041: VecGetArray(xin,&xa);
1042: VecGetArray(yin,&ya);
1043: VecGetArray(win,&wa);
1045: for (i=0; i<tmap->nthreads; i++) {
1046: vec_kerneldatap[i].X = win;
1047: vec_kerneldatap[i].thread_id = i;
1048: vec_kerneldatap[i].w = wa;
1049: vec_kerneldatap[i].x = xa;
1050: vec_kerneldatap[i].y = ya;
1051: vec_pdata[i] = &vec_kerneldatap[i];
1052: }
1054: PetscThreadsRunKernel(VecPointwiseMult_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
1056: VecRestoreArray(xin,&xa);
1057: VecRestoreArray(yin,&ya);
1058: VecRestoreArray(win,&wa);
1059: PetscLogFlops(win->map->n);
1060: return(0);
1061: }
1063: PetscErrorCode VecPointwiseDivide_Kernel(void *arg)
1064: {
1065: Vec_KernelData *data = (Vec_KernelData*)arg;
1066: PetscErrorCode ierr;
1067: Vec X=data->X;
1068: PetscInt thread_id=data->thread_id;
1069: PetscInt start,end;
1070: PetscScalar *ww = data->w,*xx = data->x,*yy = data->y;
1071: PetscInt i;
1073: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
1074: for (i=start; i<end; i++) {
1075: ww[i] = xx[i] / yy[i];
1076: }
1077: return(0);
1078: }
1082: PetscErrorCode VecPointwiseDivide_SeqPThread(Vec win,Vec xin,Vec yin)
1083: {
1084: PetscErrorCode ierr;
1085: PetscThreadsLayout tmap=win->map->tmap;
1086: PetscScalar *ya,*xa,*wa;
1087: PetscInt i;
1091: VecGetArray(xin,&xa);
1092: VecGetArray(yin,&ya);
1093: VecGetArray(win,&wa);
1095: for (i=0; i<tmap->nthreads; i++) {
1096: vec_kerneldatap[i].X = win;
1097: vec_kerneldatap[i].thread_id = i;
1098: vec_kerneldatap[i].w = wa;
1099: vec_kerneldatap[i].x = xa;
1100: vec_kerneldatap[i].y = ya;
1101: vec_pdata[i] = &vec_kerneldatap[i];
1102: }
1104: PetscThreadsRunKernel(VecPointwiseDivide_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
1106: VecRestoreArray(xin,&xa);
1107: VecRestoreArray(yin,&ya);
1108: VecRestoreArray(win,&wa);
1109: PetscLogFlops(win->map->n);
1110: return(0);
1111: }
1113: #include <petscblaslapack.h>
1114: PetscErrorCode VecSwap_Kernel(void *arg)
1115: {
1116: Vec_KernelData *data = (Vec_KernelData*)arg;
1117: PetscErrorCode ierr;
1118: Vec X=data->X;
1119: PetscInt thread_id=data->thread_id;
1120: PetscInt n,start,end;
1121: PetscScalar *xa = data->x,*ya = data->y;
1122: PetscBLASInt one = 1,bn,bstart;
1124: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
1125: n = end-start;
1126: bn = PetscBLASIntCast(n);
1127: bstart = PetscBLASIntCast(start);
1128: BLASswap_(&bn,xa+bstart,&one,ya+bstart,&one);
1129: return(0);
1130: }
1134: PetscErrorCode VecSwap_SeqPThread(Vec xin,Vec yin)
1135: {
1136: PetscErrorCode ierr;
1137: PetscThreadsLayout tmap=xin->map->tmap;
1138: PetscScalar *ya,*xa;
1139: PetscInt i;
1143: if (xin != yin) {
1144: VecGetArray(xin,&xa);
1145: VecGetArray(yin,&ya);
1147: for (i=0; i<tmap->nthreads; i++) {
1148: vec_kerneldatap[i].X = xin;
1149: vec_kerneldatap[i].thread_id = i;
1150: vec_kerneldatap[i].x = xa;
1151: vec_kerneldatap[i].y = ya;
1152: vec_pdata[i] = &vec_kerneldatap[i];
1153: }
1155: PetscThreadsRunKernel(VecSwap_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
1156: VecRestoreArray(xin,&xa);
1157: VecRestoreArray(yin,&ya);
1158: }
1159: return(0);
1160: }
1162: PetscErrorCode VecSetRandom_Kernel(void *arg)
1163: {
1164: Vec_KernelData *data = (Vec_KernelData*)arg;
1165: PetscErrorCode ierr;
1166: Vec X=data->X;
1167: PetscInt thread_id=data->thread_id;
1168: PetscInt start,end;
1169: PetscScalar *xx = data->x;
1170: PetscRandom r = data->rand;
1171: PetscInt i;
1173: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
1174: for(i=start; i<end; i++) {
1175: PetscRandomGetValue(r,&xx[i]);
1176: }
1177: return(0);
1178: }
1182: PetscErrorCode VecSetRandom_SeqPThread(Vec xin,PetscRandom r)
1183: {
1184: PetscErrorCode ierr;
1185: PetscThreadsLayout tmap=xin->map->tmap;
1186: PetscInt i;
1187: PetscScalar *xa;
1191: VecGetArray(xin,&xa);
1193: for (i=0; i<tmap->nthreads; i++) {
1194: vec_kerneldatap[i].X = xin;
1195: vec_kerneldatap[i].thread_id = i;
1196: vec_kerneldatap[i].x = xa;
1197: vec_kerneldatap[i].rand = r;
1198: vec_pdata[i] = &vec_kerneldatap[i];
1199: }
1201: PetscThreadsRunKernel(VecSetRandom_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
1202: VecRestoreArray(xin,&xa);
1203: return(0);
1204: }
1206: PetscErrorCode VecCopy_Kernel(void *arg)
1207: {
1208: Vec_KernelData *data = (Vec_KernelData*)arg;
1209: PetscErrorCode ierr;
1210: Vec X=data->X;
1211: PetscInt thread_id=data->thread_id;
1212: PetscInt start,end;
1213: const PetscScalar *xa = (const PetscScalar*)data->x;
1214: PetscScalar *ya = data->y;
1215: PetscInt n;
1217: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
1218: n = end-start;
1219: PetscMemcpy(ya+start,xa+start,n*sizeof(PetscScalar));
1220: return(0);
1221: }
1225: PetscErrorCode VecCopy_SeqPThread(Vec xin,Vec yin)
1226: {
1228: PetscErrorCode ierr;
1229: PetscThreadsLayout tmap=yin->map->tmap;
1230: PetscScalar *ya,*xa;
1231: PetscInt i;
1235: if (xin != yin) {
1236: VecGetArray(xin,&xa);
1237: VecGetArray(yin,&ya);
1239: for (i=0; i<tmap->nthreads; i++) {
1240: vec_kerneldatap[i].X = yin;
1241: vec_kerneldatap[i].thread_id = i;
1242: vec_kerneldatap[i].x = xa;
1243: vec_kerneldatap[i].y = ya;
1244: vec_pdata[i] = &vec_kerneldatap[i];
1245: }
1246: PetscThreadsRunKernel(VecCopy_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
1248: VecRestoreArray(xin,&xa);
1249: VecRestoreArray(yin,&ya);
1250: }
1251: return(0);
1252: }
1254: PetscErrorCode VecMAXPY_Kernel(void* arg)
1255: {
1256: Vec_KernelData *data = (Vec_KernelData*)arg;
1257: PetscErrorCode ierr;
1258: Vec X=data->X;
1259: PetscInt thread_id=data->thread_id;
1260: PetscInt start,end;
1261: PetscInt n,nv=data->nvec,j,j_rem;
1262: const PetscScalar *alpha=data->amult,*yy0,*yy1,*yy2,*yy3;
1263: PetscScalar *xx,alpha0,alpha1,alpha2,alpha3;
1264: Vec* y = (Vec*)data->yvec;
1266: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
1267: xx = data->x+start;
1268: n = end-start;
1269: switch (j_rem=nv&0x3) {
1270: case 3:
1271: VecGetArrayRead(y[0],&yy0);
1272: VecGetArrayRead(y[1],&yy1);
1273: VecGetArrayRead(y[2],&yy2);
1274: yy0 += start; yy1 += start; yy2 += start;
1275: alpha0 = alpha[0];
1276: alpha1 = alpha[1];
1277: alpha2 = alpha[2];
1278: alpha += 3;
1279: PetscAXPY3(xx,alpha0,alpha1,alpha2,yy0,yy1,yy2,n);
1280: VecRestoreArrayRead(y[0],&yy0);
1281: VecRestoreArrayRead(y[1],&yy1);
1282: VecRestoreArrayRead(y[2],&yy2);
1283: y += 3;
1284: break;
1285: case 2:
1286: VecGetArrayRead(y[0],&yy0);
1287: VecGetArrayRead(y[1],&yy1);
1288: yy0 += start; yy1 += start;
1289: alpha0 = alpha[0];
1290: alpha1 = alpha[1];
1291: alpha +=2;
1292: PetscAXPY2(xx,alpha0,alpha1,yy0,yy1,n);
1293: VecRestoreArrayRead(y[0],&yy0);
1294: VecRestoreArrayRead(y[1],&yy1);
1295: y +=2;
1296: break;
1297: case 1:
1298: VecGetArrayRead(y[0],&yy0);
1299: yy0 += start;
1300: alpha0 = *alpha++;
1301: PetscAXPY(xx,alpha0,yy0,n);
1302: VecRestoreArrayRead(y[0],&yy0);
1303: y +=1;
1304: break;
1305: }
1306: for (j=j_rem; j<nv; j+=4) {
1307: VecGetArrayRead(y[0],&yy0);
1308: VecGetArrayRead(y[1],&yy1);
1309: VecGetArrayRead(y[2],&yy2);
1310: VecGetArrayRead(y[3],&yy3);
1311: yy0 += start; yy1 += start; yy2 += start; yy3 += start;
1312: alpha0 = alpha[0];
1313: alpha1 = alpha[1];
1314: alpha2 = alpha[2];
1315: alpha3 = alpha[3];
1316: alpha += 4;
1318: PetscAXPY4(xx,alpha0,alpha1,alpha2,alpha3,yy0,yy1,yy2,yy3,n);
1319: VecRestoreArrayRead(y[0],&yy0);
1320: VecRestoreArrayRead(y[1],&yy1);
1321: VecRestoreArrayRead(y[2],&yy2);
1322: VecRestoreArrayRead(y[3],&yy3);
1323: y += 4;
1324: }
1325: return(0);
1326: }
1330: PetscErrorCode VecMAXPY_SeqPThread(Vec xin, PetscInt nv,const PetscScalar *alpha,Vec *yin)
1331: {
1332: PetscErrorCode ierr;
1333: PetscThreadsLayout tmap=xin->map->tmap;
1334: PetscInt i;
1335: Vec *yy = (Vec *)yin;
1336: PetscScalar *xa;
1340: VecGetArray(xin,&xa);
1341: for (i=0; i<tmap->nthreads; i++) {
1342: vec_kerneldatap[i].X = xin;
1343: vec_kerneldatap[i].thread_id = i;
1344: vec_kerneldatap[i].x = xa;
1345: vec_kerneldatap[i].yvec = yy;
1346: vec_kerneldatap[i].amult = &alpha[0];
1347: vec_kerneldatap[i].nvec = nv;
1348: vec_pdata[i] = &vec_kerneldatap[i];
1349: }
1350: PetscThreadsRunKernel(VecMAXPY_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
1352: VecRestoreArray(xin,&xa);
1353: PetscLogFlops(nv*2.0*xin->map->n);
1354: return(0);
1355: }
1357: PetscErrorCode VecSet_Kernel(void *arg)
1358: {
1359: Vec_KernelData *data = (Vec_KernelData*)arg;
1360: PetscErrorCode ierr;
1361: Vec X=data->X;
1362: PetscInt thread_id=data->thread_id;
1363: PetscInt start,end;
1364: PetscScalar *xx = data->x;
1365: PetscScalar alpha = data->alpha;
1366: PetscInt i,n;
1368: VecGetThreadOwnershipRange(X,thread_id,&start,&end);
1369: n = end-start;
1370: if (alpha == (PetscScalar)0.0) {
1371: PetscMemzero(xx+start,n*sizeof(PetscScalar));
1372: } else {
1373: for (i=start; i<end; i++) xx[i] = alpha;
1374: }
1375: return(0);
1376: }
1380: PetscErrorCode VecSet_SeqPThread(Vec xin,PetscScalar alpha)
1381: {
1382: PetscErrorCode ierr;
1383: PetscThreadsLayout tmap=xin->map->tmap;
1384: PetscInt i;
1385: PetscScalar *xa;
1389: VecGetArray(xin,&xa);
1391: for (i=0; i<tmap->nthreads; i++) {
1392: vec_kerneldatap[i].X = xin;
1393: vec_kerneldatap[i].thread_id = i;
1394: vec_kerneldatap[i].x = xa;
1395: vec_kerneldatap[i].alpha = alpha;
1396: vec_pdata[i] = &vec_kerneldatap[i];
1397: }
1398: PetscThreadsRunKernel(VecSet_Kernel,(void**)vec_pdata,tmap->nthreads,tmap->affinity);
1399: VecRestoreArray(xin,&xa);
1400: return(0);
1401: }
1405: PetscErrorCode VecDestroy_SeqPThread(Vec v)
1406: {
1407: Vec_Seq *vs = (Vec_Seq*)v->data;
1411: PetscObjectDepublish(v);
1413: #if defined(PETSC_USE_LOG)
1414: PetscLogObjectState((PetscObject)v,"Length=%D",v->map->n);
1415: #endif
1416: PetscFree(vs->array_allocated);
1417: PetscFree(v->data);
1419: vecs_created--;
1420: /* Free the kernel data structure on the destruction of the last vector */
1421: if (!vecs_created) {
1422: PetscFree(vec_kerneldatap);
1423: PetscFree(vec_pdata);
1424: }
1426: return(0);
1427: }
1431: PetscErrorCode VecDuplicate_SeqPThread(Vec win,Vec *V)
1432: {
1436: VecCreate(((PetscObject)win)->comm,V);
1437: PetscObjectSetPrecision((PetscObject)*V,((PetscObject)win)->precision);
1438: VecSetSizes(*V,win->map->n,win->map->n);
1439: PetscLayoutReference(win->map,&(*V)->map);
1440: VecSetType(*V,((PetscObject)win)->type_name);
1441: PetscOListDuplicate(((PetscObject)win)->olist,&((PetscObject)(*V))->olist);
1442: PetscFListDuplicate(((PetscObject)win)->qlist,&((PetscObject)(*V))->qlist);
1444: (*V)->stash.ignorenegidx = win->stash.ignorenegidx;
1446: return(0);
1447: }
1451: /*@
1452: VecSetNThreads - Set the number of threads to be used for vector operations.
1454: Input Parameters:
1455: + v - the vector
1456: - nthreads - number of threads
1458: Note:
1459: Use nthreads = PETSC_DECIDE for PETSc to determine the number of threads.
1461: Options Database Keys:
1462: -vec_threads <nthreads> - Number of threads
1464: Level: intermediate
1466: Concepts: vectors^number of threads
1468: .seealso: VecCreateSeqPThread(), VecGetNThreads()
1469: @*/
1470: PetscErrorCode VecSetNThreads(Vec v,PetscInt nthreads)
1471: {
1472: PetscErrorCode ierr;
1473: PetscThreadsLayout tmap=v->map->tmap;
1474: PetscInt nworkThreads=PetscMaxThreads+PetscMainThreadShareWork;
1478: if(!tmap) {
1479: PetscThreadsLayoutCreate(&tmap);
1480: v->map->tmap = tmap;
1481: }
1483: if(nthreads == PETSC_DECIDE) {
1484: tmap->nthreads = nworkThreads;
1485: PetscOptionsInt("-vec_threads","Set number of threads to be used for vector operations","VecSetNThreads",nworkThreads,&tmap->nthreads,PETSC_NULL);
1486: if(tmap->nthreads > nworkThreads) {
1487: SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ, "Vec x: threads requested %D, Max. threads initialized %D",tmap->nthreads,nworkThreads);
1488: }
1489: } else {
1490: if(nthreads > nworkThreads) {
1491: SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ, "Vec x: threads requested %D, Max. threads initialized %D",nthreads,nworkThreads);
1492: }
1493: tmap->nthreads = nthreads;
1494: }
1496: return(0);
1497: }
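/*
   Usage sketch (illustrative, not part of the original source): set the
   thread count explicitly while building a vector, mirroring the call
   sequence used by VecCreateSeqPThread() further below. The vector v, the
   size n, and ierr are assumed to be supplied by the caller; pass
   PETSC_DECIDE instead of 4 to honor -vec_threads.

     ierr = VecCreate(PETSC_COMM_SELF,&v);CHKERRQ(ierr);
     ierr = VecSetSizes(v,n,n);CHKERRQ(ierr);
     ierr = VecSetNThreads(v,4);CHKERRQ(ierr);
     ierr = VecSetType(v,VECSEQPTHREAD);CHKERRQ(ierr);
*/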
1501: /*@
1502: VecGetNThreads - Returns the number of threads used for vector operations.
1504: Input Parameter:
1505: . v - the vector
1507: Output Parameter:
1508: . nthreads - number of threads
1510: Level: intermediate
1512: Concepts: vectors^number of threads
1514: .seealso: VecSetNThreads()
1515: @*/
1516: PetscErrorCode VecGetNThreads(Vec v,PetscInt *nthreads)
1517: {
1518: PetscThreadsLayout tmap=v->map->tmap;
1520: *nthreads = tmap->nthreads;
1521: return(0);
1522: }
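/*
   Usage sketch (illustrative, not part of the original source): query how
   many threads a vector is using; v and ierr are assumed to exist.

     PetscInt nthreads;
     ierr = VecGetNThreads(v,&nthreads);CHKERRQ(ierr);
     ierr = PetscPrintf(PETSC_COMM_SELF,"vector uses %D threads\n",nthreads);CHKERRQ(ierr);
*/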
1526: /*@
1527: VecSetThreadAffinities - Sets the CPU affinities of vector threads.
1529: Input Parameters:
1530: + v - the vector
1531: - affinities - list of cpu affinities for threads.
1533: Notes:
1534: Must set affinities for all the threads used with the vector (not including the main thread)
1535:
1536: Use affinities[] = PETSC_NULL for PETSc to decide the thread affinities.
1538: Options Database Keys:
1539: -vec_thread_affinities - Comma-separated list of thread affinities
1541: Level: intermediate
1543: Concepts: vectors^thread cpu affinity
1545: .seealso: VecGetThreadAffinities()
1546: @*/
1547: PetscErrorCode VecSetThreadAffinities(Vec v,const PetscInt affinities[])
1548: {
1549: PetscErrorCode ierr;
1550: PetscThreadsLayout tmap = v->map->tmap;
1551: PetscInt nmax=PetscMaxThreads+PetscMainThreadShareWork;
1552: PetscBool flg;
1556: if(!tmap) {
1557: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Must set the number of threads before setting thread affinities");
1558: }
1560: PetscMalloc(tmap->nthreads*sizeof(PetscInt),&tmap->affinity);
1562: if(affinities == PETSC_NULL) {
1563: /* PETSc decides affinities */
1564: PetscInt *thread_affinities;
1565: PetscMalloc(nmax*sizeof(PetscInt),&thread_affinities);
1566: /* Check if run-time option is set */
1567: PetscOptionsIntArray("-vec_thread_affinities","Set CPU affinities of vector threads","VecSetThreadAffinities",thread_affinities,&nmax,&flg);
1568: if(flg) {
1569: if(nmax != tmap->nthreads) {
1570: SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Must set affinities for all threads, vector Threads = %D, CPU affinities set = %D",tmap->nthreads,nmax);
1571: }
1572: PetscMemcpy(tmap->affinity,thread_affinities,tmap->nthreads*sizeof(PetscInt));
1573: } else {
1574: /* Reuse the core affinities set for the first nthreads */
1575: PetscMemcpy(tmap->affinity,PetscThreadsCoreAffinities,tmap->nthreads*sizeof(PetscInt));
1576: }
1577: PetscFree(thread_affinities);
1578: } else {
1579: /* Set user provided affinities */
1580: PetscMemcpy(tmap->affinity,affinities,tmap->nthreads*sizeof(PetscInt));
1581: }
1583: return(0);
1584: }
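/*
   Usage sketch (illustrative, not part of the original source): pin a
   two-thread vector to cores 0 and 2. The core numbers are assumptions for
   the example only; note that VecSetNThreads() must be called first, as the
   error check above enforces.

     PetscInt affinities[2] = {0,2};
     ierr = VecSetNThreads(v,2);CHKERRQ(ierr);
     ierr = VecSetThreadAffinities(v,affinities);CHKERRQ(ierr);
*/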
1588: PetscErrorCode VecView_SeqPthread(Vec xin,PetscViewer viewer)
1589: {
1590: PetscErrorCode ierr;
1591: PetscViewerFormat format;
1594: VecView_Seq(xin,viewer);
1595: PetscViewerGetFormat(viewer,&format);
1596: if (format == PETSC_VIEWER_ASCII_INFO || format == PETSC_VIEWER_ASCII_INFO_DETAIL) {
1597: PetscViewerASCIIPushTab(viewer);
1598: PetscViewerASCIIPrintf(viewer,"Number threads used=%D\n",xin->map->tmap->nthreads);
1599: PetscViewerASCIIPopTab(viewer);
1600: }
1601: return(0);
1602: }
1605: static struct _VecOps DvOps = {VecDuplicate_SeqPThread, /* 1 */
1606: VecDuplicateVecs_Default,
1607: VecDestroyVecs_Default,
1608: VecDot_SeqPThread,
1609: VecMDot_SeqPThread,
1610: VecNorm_SeqPThread,
1611: VecTDot_SeqPThread,
1612: VecMTDot_SeqPThread,
1613: VecScale_SeqPThread,
1614: VecCopy_SeqPThread, /* 10 */
1615: VecSet_SeqPThread,
1616: VecSwap_SeqPThread,
1617: VecAXPY_SeqPThread,
1618: VecAXPBY_SeqPThread,
1619: VecMAXPY_SeqPThread,
1620: VecAYPX_SeqPThread,
1621: VecWAXPY_SeqPThread,
1622: VecAXPBYPCZ_Seq,
1623: VecPointwiseMult_SeqPThread,
1624: VecPointwiseDivide_SeqPThread,
1625: VecSetValues_Seq, /* 20 */
1626: 0,0,
1627: 0,
1628: VecGetSize_Seq,
1629: VecGetSize_Seq,
1630: 0,
1631: VecMax_SeqPThread,
1632: VecMin_SeqPThread,
1633: VecSetRandom_SeqPThread,
1634: VecSetOption_Seq, /* 30 */
1635: VecSetValuesBlocked_Seq,
1636: VecDestroy_SeqPThread,
1637: VecView_SeqPthread,
1638: VecPlaceArray_Seq,
1639: VecReplaceArray_Seq,
1640: VecDot_SeqPThread,
1641: VecTDot_SeqPThread,
1642: VecNorm_SeqPThread,
1643: VecMDot_SeqPThread,
1644: VecMTDot_SeqPThread, /* 40 */
1645: VecLoad_Default,
1646: VecReciprocal_Default,
1647: VecConjugate_Seq,
1648: 0,
1649: 0,
1650: VecResetArray_Seq,
1651: 0,
1652: VecMaxPointwiseDivide_Seq,
1653: VecPointwiseMax_Seq,
1654: VecPointwiseMaxAbs_Seq,
1655: VecPointwiseMin_Seq,
1656: VecGetValues_Seq,
1657: 0,
1658: 0,
1659: 0,
1660: 0,
1661: 0,
1662: 0,
1663: VecStrideGather_Default,
1664: VecStrideScatter_Default
1665: };
1669: PetscErrorCode VecCreate_SeqPThread_Private(Vec v,const PetscScalar array[])
1670: {
1671: Vec_Seq *s;
1672: PetscErrorCode ierr;
1673: PetscThreadsLayout tmap=v->map->tmap;
1676: PetscNewLog(v,Vec_Seq,&s);
1677: PetscMemcpy(v->ops,&DvOps,sizeof(DvOps));
1678: v->data = (void*)s;
1679: v->petscnative = PETSC_TRUE;
1680: s->array = (PetscScalar *)array;
1681: s->array_allocated = 0;
1683: if(!v->map->tmap) {
1684: PetscThreadsLayoutCreate(&v->map->tmap);
1685: tmap = v->map->tmap;
1686: }
1688: /* If this is the first vector being created then also create the common Kernel data structure */
1689: if(vecs_created == 0) {
1690: PetscMalloc((PetscMaxThreads+PetscMainThreadShareWork)*sizeof(Vec_KernelData),&vec_kerneldatap);
1691: PetscMalloc((PetscMaxThreads+PetscMainThreadShareWork)*sizeof(Vec_KernelData*),&vec_pdata);
1692: }
1693: vecs_created++;
1695: PetscLayoutSetUp(v->map);
1697: tmap->N = v->map->n;
1698:
1699: /* Set the number of threads */
1700: if(tmap->nthreads == PETSC_DECIDE) {
1701: VecSetNThreads(v,PETSC_DECIDE);
1702: }
1703: /* Set thread affinities */
1704: if(!tmap->affinity) {
1705: VecSetThreadAffinities(v,PETSC_NULL);
1706: }
1708: PetscThreadsLayoutSetUp(tmap);
1710: PetscObjectChangeTypeName((PetscObject)v,VECSEQPTHREAD);
1712: return(0);
1713: }
1715: /*MC
1716: VECSEQPTHREAD - VECSEQPTHREAD = "seqpthread" - The basic sequential vector using POSIX threads
1718: Options Database Keys:
1719: . -vec_type seqpthread - sets the vector type to VECSEQPTHREAD during a call to VecSetFromOptions()
1721: Level: intermediate
1723: .seealso: VecCreate(), VecCreateSeqPThread(), VecSetType(), VecSetFromOptions(), VECSEQ
1724: M*/
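/*
   Example command line (illustrative, not part of the original source): an
   existing PETSc program (here a hypothetical ./ex1) can switch to threaded
   sequential vectors at runtime, provided it calls VecSetFromOptions():

     ./ex1 -vec_type seqpthread -vec_threads 4
*/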
1726: EXTERN_C_BEGIN
1729: PetscErrorCode VecCreate_SeqPThread(Vec V)
1730: {
1731: Vec_Seq *s;
1732: PetscScalar *array;
1733: PetscErrorCode ierr;
1734: PetscInt n = PetscMax(V->map->n,V->map->N);
1735: PetscMPIInt size;
1738: MPI_Comm_size(((PetscObject)V)->comm,&size);
1739: if (size > 1) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Cannot create VECSEQPTHREAD on more than one process");
1740: PetscThreadsInitialize(PetscMaxThreads);
1741: PetscMalloc(n*sizeof(PetscScalar),&array);
1742: PetscLogObjectMemory(V, n*sizeof(PetscScalar));
1743: VecCreate_SeqPThread_Private(V,array);
1744: VecSet_SeqPThread(V,0.0);
1745: s = (Vec_Seq*)V->data;
1746: s->array_allocated = (PetscScalar*)s->array;
1748: return(0);
1749: }
1750: EXTERN_C_END
1754: /*@
1755: VecCreateSeqPThread - Creates a standard, sequential array-style vector using POSIX threads.
1757: Collective on MPI_Comm
1759: Input Parameters:
1760: + comm - the communicator, should be PETSC_COMM_SELF
1761: . n - the vector length
1762: . nthreads - number of threads
1763: - affinities - thread affinities
1765: Output Parameter:
1766: . V - the vector
1768: Notes:
1769: Use VecDuplicate() or VecDuplicateVecs() to form additional vectors of the
1770: same type as an existing vector.
1772: Use nthreads = PETSC_DECIDE to let PETSc decide the number of threads and
1773: affinities = PETSC_NULL to let PETSc decide the thread affinities.
1775: Options Database Keys:
1776: -vec_threads <nthreads> - Sets number of threads to be used for vector operations
1777: -vec_thread_affinities - Comma-separated list of thread affinities
1779: Level: intermediate
1781: Concepts: vectors^creating sequential with threads
1783: .seealso: VecCreateSeq(), VecSetNThreads(), VecSetThreadAffinities(), VecDuplicate(), VecDuplicateVecs()
1784: @*/
1785: PetscErrorCode VecCreateSeqPThread(MPI_Comm comm,PetscInt n,PetscInt nthreads,PetscInt affinities[],Vec *v)
1786: {
1790: VecCreate(comm,v);
1791: VecSetSizes(*v,n,n);
1792: VecSetNThreads(*v,nthreads);
1793: VecSetThreadAffinities(*v,affinities);
1794: VecSetType(*v,VECSEQPTHREAD);
1795: return(0);
1796: }
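/*
   Complete usage sketch (illustrative, not part of the original source):
   create a length-100 vector with two threads and default affinities,
   perform a couple of operations, and clean up.

     #include <petscvec.h>
     int main(int argc,char **argv)
     {
       Vec            v;
       PetscReal      nrm;
       PetscErrorCode ierr;

       ierr = PetscInitialize(&argc,&argv,PETSC_NULL,PETSC_NULL);CHKERRQ(ierr);
       ierr = VecCreateSeqPThread(PETSC_COMM_SELF,100,2,PETSC_NULL,&v);CHKERRQ(ierr);
       ierr = VecSet(v,1.0);CHKERRQ(ierr);
       ierr = VecNorm(v,NORM_2,&nrm);CHKERRQ(ierr);
       ierr = VecDestroy(&v);CHKERRQ(ierr);
       ierr = PetscFinalize();
       return 0;
     }
*/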