| #include "config.h" |
| |
| #ifdef USE_ROLL_ASM /* { */ |
| |
#define CHAR_OFFSET 0 /* Must match CHAR_OFFSET in rsync.h, which isn't likely to change. */
| |
| #ifdef __APPLE__ |
| #define get_checksum1_avx2_asm _get_checksum1_avx2_asm |
| #endif |
| |
| .intel_syntax noprefix |
| .text |
| |
| .p2align 5 |
| .globl get_checksum1_avx2_asm |
| |
# arguments (System V AMD64 ABI): rdi=*buf, esi=len, edx=i, rcx=*ps1, r8=*ps2
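# A hedged C-side sketch of the assumed prototype (not part of the build;
# the real declaration lives in rsync's SIMD checksum wrapper and may use
# rsync's own typedefs):
#
#   int32_t get_checksum1_avx2_asm(const signed char *buf, int32_t len,
#                                  int32_t i, uint32_t *ps1, uint32_t *ps2);
#
# When at least 128 bytes remain past i, it checksums whole 64-byte blocks
# starting at buf+i, adds their contribution to *ps1/*ps2 in place, and
# returns the updated index i; otherwise it returns i unchanged so the
# caller can fall back to the scalar loop.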
| get_checksum1_avx2_asm: |
| vmovd xmm6,[rcx] # load *ps1 |
| lea eax, [rsi-128] # at least 128 bytes to process? |
| cmp edx, eax |
| jg .exit |
| lea rax, .mul_T2[rip] |
| vmovntdqa ymm7, [rax] # load T2 multiplication constants |
vmovntdqa ymm12, [rax+32] # from memory.
| vpcmpeqd ymm15, ymm15, ymm15 # set all elements to -1. |
| |
| #if CHAR_OFFSET != 0 |
| mov eax, 32*CHAR_OFFSET |
| vmovd xmm10, eax |
| vpbroadcastd ymm10, xmm10 |
| mov eax, 528*CHAR_OFFSET |
| vmovd xmm13, eax |
| vpbroadcastd ymm13, xmm13 |
| #endif |
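# These constants are presumably the per-32-byte CHAR_OFFSET contributions:
# 32 bytes add 32*CHAR_OFFSET to s1, and 528*CHAR_OFFSET (528 = 1+2+...+32)
# is the matching weighted contribution to s2.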
| vpabsb ymm15, ymm15 # set all byte size elements to 1. |
| add rdi, rdx |
| vmovdqu ymm2, [rdi] # preload the first 64 bytes. |
| vmovdqu ymm3, [rdi+32] |
| and esi, ~63 # only needed during final reduction, |
| # done here to avoid a longer nop for |
| # alignment below. |
add edx, esi # i += len & ~63: the index returned to the caller.
shr rsi, 6 # 64-byte block count (longer opcode for alignment).
| add rdi, 64 |
| vpxor xmm1, xmm1, xmm1 # reset both partial sums accumulators. |
| vpxor xmm4, xmm4, xmm4 |
mov eax, [r8] # load the old *ps2; the s2 delta is added to it at the end.
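# The loop below vectorizes rsync's scalar rolling checksum 64 bytes at a
# time. As a reference sketch (not assembled; b[] holds the block's signed
# bytes, s1/s2 the running sums), each block performs:
#
#   for (int j = 0; j < 64; j++) {
#       s1 += b[j] + CHAR_OFFSET;
#       s2 += s1;
#   }
#
# which, with the .mul_T2 weights w[j] = 64 - j, is equivalent to:
#
#   s2 += 64*s1_old + w[0]*b[0] + ... + w[63]*b[63] + 2080*CHAR_OFFSET;
#   s1 += b[0] + ... + b[63] + 64*CHAR_OFFSET;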
.p2align 4 # the loop should fit into the LSD (loop stream detector) queue.
| .loop: |
| vpmaddubsw ymm0, ymm15, ymm2 # s1 partial sums |
| vpmaddubsw ymm5, ymm15, ymm3 |
| vmovdqu ymm8, [rdi] # preload the next |
| vmovdqu ymm9, [rdi+32] # 64 bytes. |
| add rdi, 64 |
vpaddd ymm4, ymm4, ymm6 # accumulate the pre-update s1 (the 64*s1 term of s2).
vpaddw ymm5, ymm5, ymm0 # combine the byte sums of both 32-byte halves,
vpsrld ymm0, ymm5, 16 # fold each dword's two word sums
vpaddw ymm5, ymm0, ymm5 # into its low word,
vpaddd ymm6, ymm5, ymm6 # and add them to the running s1 lanes.
| vpmaddubsw ymm2, ymm7, ymm2 # s2 partial sums |
| vpmaddubsw ymm3, ymm12, ymm3 |
| prefetcht0 [rdi+384] # prefetch 6 cachelines ahead. |
vpaddw ymm3, ymm2, ymm3 # combine the weighted sums of both halves,
vpsrldq ymm2, ymm3, 2 # fold each dword's two word sums together,
vpaddd ymm3, ymm2, ymm3
vpaddd ymm1, ymm1, ymm3 # and accumulate into the s2 partial sums.
| |
| #if CHAR_OFFSET != 0 |
| vpaddd ymm6, ymm10, ymm6 # 32*CHAR_OFFSET |
| vpaddd ymm1, ymm13, ymm1 # 528*CHAR_OFFSET |
| #endif |
| vmovdqa ymm2, ymm8 # move the next 64 bytes |
| vmovdqa ymm3, ymm9 # into the right registers |
| sub esi, 1 |
| jnz .loop |
| |
# now we reduce the partial sums: scale the accumulated pre-block s1
# values (ymm4) by 64, fold them into the s2 partial sums, then
# horizontally add each accumulator's eight dword lanes to a single scalar.
| vpslld ymm3, ymm4, 6 |
| vpsrldq ymm2, ymm6, 4 |
| |
| vpaddd ymm0, ymm3, ymm1 |
| vpaddd ymm6, ymm2, ymm6 |
| vpsrlq ymm3, ymm0, 32 |
| |
| vpsrldq ymm2, ymm6, 8 |
| vpaddd ymm0, ymm3, ymm0 |
| vpsrldq ymm3, ymm0, 8 |
| vpaddd ymm6, ymm2, ymm6 |
| vpaddd ymm0, ymm3, ymm0 |
| vextracti128 xmm2, ymm6, 0x1 |
| vextracti128 xmm1, ymm0, 0x1 |
| vpaddd xmm6, xmm2, xmm6 |
vmovd [rcx], xmm6 # store the updated s1 back to *ps1.
vpaddd xmm1, xmm1, xmm0
vmovd ecx, xmm1
add eax, ecx # add the reduced s2 delta to the old *ps2
mov [r8], eax # and store it back to *ps2.
| .exit: |
| vzeroupper |
| mov eax, edx |
| ret |
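# Example caller-side usage (hypothetical names; the real call site lives
# in rsync's SIMD checksum wrapper):
#
#   uint32_t s1 = 0, s2 = 0;
#   int32_t i = get_checksum1_avx2_asm(buf, len, 0, &s1, &s2);
#   then finish the remaining len - i tail bytes with the scalar loop.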
| |
| #ifdef __APPLE__ |
| .data |
.align 6 # 2^6 = 64-byte alignment (Mach-O .align takes a power of two).
| #else |
| .section .rodata |
| .p2align 6 |
| #endif |
# Byte multipliers for the s2 partial sums: the weights 64..1 applied
# across each 64-byte block (64..33 load into ymm7, 32..1 into ymm12).
.mul_T2:
.byte 64, 63, 62, 61, 60, 59, 58, 57
.byte 56, 55, 54, 53, 52, 51, 50, 49
.byte 48, 47, 46, 45, 44, 43, 42, 41
.byte 40, 39, 38, 37, 36, 35, 34, 33
.byte 32, 31, 30, 29, 28, 27, 26, 25
.byte 24, 23, 22, 21, 20, 19, 18, 17
.byte 16, 15, 14, 13, 12, 11, 10, 9
.byte 8, 7, 6, 5, 4, 3, 2, 1
| |
| #endif /* } USE_ROLL_ASM */ |