;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
; interpolate8x8_halfpel_hv_asm_dm642
;
; TI C64x (DM642) linear assembly -- input for the assembly optimizer.
;
; Half-pel horizontal+vertical interpolation of an 8x8 byte block:
;
;   dst[y][x] = ( src[y][x]   + src[y][x+1]
;               + src[y+1][x] + src[y+1][x+1] + 2 ) >> 2     x, y = 0..7
;
; The rounding term is the fixed constant 2.
;
; Arguments (.cproc order):
;   A_dst0   - destination pointer, row 0
;   B_src0   - source pointer, row 0
;   A_stride - destination row stride in bytes
;   B_stride - source row stride in bytes
;              NOTE(review): the 4th argument is used purely as the source
;              stride; confirm that callers really pass a stride here.
;
; Requirements that follow from the instructions used:
;   * dst is written with STDW (aligned 8-byte store): A_dst0 and A_stride
;     must be multiples of 8.
;   * src is read with LDNDW (non-aligned 8-byte load): no alignment needed,
;     but bytes 0..8 of source rows 0..8 are read.
;   * Pointers advance by (stride >> 2) * 8, which equals 2 * stride only
;     when the stride is a multiple of 4.
;
; Each loop pass produces two output rows (r and r+1) from three source
; rows (r, r+1, r+2); four passes cover the 8x8 block.
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
        .sect   ".text:_interpolate8x8_halfpel_hv_asm_dm642"
        .global _interpolate8x8_halfpel_hv_asm_dm642

_interpolate8x8_halfpel_hv_asm_dm642 .cproc A_dst0, B_src0, A_stride, B_stride

        .no_mdep                                ; no memory dependences between loads and stores

        ; pointers, loop counter, scaled strides, constants
        .reg    A_dst1, B_src1, B_src2
        .reg    A_i, A_pitch, B_pitch
        .reg    A_1111, B_1111, A_22, B_22

        ; raw source rows: A = row r, B = row r+1, C = row r+2
        ;   x_7654:x_3210 = bytes 0..7,  x_8765:x_4321 = bytes 1..8
        .reg    A_7654:A_3210, A_8765:A_4321
        .reg    B_7654:B_3210, B_8765:B_4321
        .reg    C_7654:C_3210, C_8765:C_4321

        ; output row r (rows A + B)
        .reg    C_1010, C_3232, C_5454, C_7676
        .reg    C_2121, C_4343, C_6565, C_8787
        .reg    A_0, A_1, A_2, A_3, A_4, A_5, A_6, A_7
        .reg    A_76, A_54, A_32, A_10
        .reg    C_76, C_54, C_32, C_10
        .reg    E_76, E_54, E_32, E_10
        .reg    D_7654:D_3210

        ; output row r+1 (rows B + C)
        .reg    F_1010, F_3232, F_5454, F_7676
        .reg    F_2121, F_4343, F_6565, F_8787
        .reg    B_0, B_1, B_2, B_3, B_4, B_5, B_6, B_7
        .reg    B_76, B_54, B_32, B_10
        .reg    D_76, D_54, D_32, D_10
        .reg    F_76, F_54, F_32, F_10
        .reg    E_7654:E_3210

        ADD     B_src0, B_stride, B_src1        ; source row r+1
        ADD     B_src1, B_stride, B_src2        ; source row r+2
        ADD     A_dst0, A_stride, A_dst1        ; destination row r+1

        ; [reg] offsets of doubleword accesses are scaled by 8, so
        ; (stride >> 2) advances a pointer by two rows per pass.
        SHR     B_stride, 2, B_pitch
        SHR     A_stride, 2, A_pitch

        MVKL    0x01010101, A_1111              ; dot product with 1s = sum of 4 bytes
        MVKH    0x01010101, A_1111
        MVKL    0x00020002, A_22                ; +2 per halfword; also supplies the shift count
        MVKH    0x00020002, A_22
        MV      A_22, B_22
        MV      A_1111, B_1111

        MVK     4, A_i                          ; 4 passes x 2 rows = 8 rows

LOOP_HV_88:     .trip 4, 4, 4

        ; ---- load three source rows at byte offsets 1 and 0 ----
        ; The offset-1 load must precede the post-incrementing load of the
        ; same pointer.
        LDNDW   *+B_src2(1), C_8765:C_4321      ; non-aligned 8-byte load
        LDNDW   *B_src2++[B_pitch], C_7654:C_3210
        LDNDW   *+B_src1(1), B_8765:B_4321
        LDNDW   *B_src1++[B_pitch], B_7654:B_3210
        LDNDW   *+B_src0(1), A_8765:A_4321
        LDNDW   *B_src0++[B_pitch], A_7654:A_3210

        ; ================= output row r : rows A + B =================
        ; Gather the 2x2 neighbourhood of each pixel into one word.
        PACK2   A_3210, B_3210, C_1010          ; a1 a0 b1 b0 -> pixel 0
        PACKH2  B_3210, A_3210, C_3232          ; b3 b2 a3 a2 -> pixel 2
        PACK2   A_7654, B_7654, C_5454          ; pixel 4
        PACKH2  B_7654, A_7654, C_7676          ; pixel 6
        PACK2   A_4321, B_4321, C_2121          ; pixel 1
        PACKH2  B_4321, A_4321, C_4343          ; pixel 3
        PACK2   A_8765, B_8765, C_6565          ; pixel 5
        PACKH2  B_8765, A_8765, C_8787          ; pixel 7

        ; Sum the four bytes of each neighbourhood.
        DOTPU4  C_1010, A_1111, A_0
        DOTPU4  C_2121, A_1111, A_1
        DOTPU4  C_3232, A_1111, A_2
        DOTPU4  C_4343, A_1111, A_3
        DOTPU4  C_5454, A_1111, A_4
        DOTPU4  C_6565, A_1111, A_5
        DOTPU4  C_7676, A_1111, A_6
        DOTPU4  C_8787, A_1111, A_7

        ; Two 16-bit sums per register.
        PACK2   A_1, A_0, A_10
        PACK2   A_3, A_2, A_32
        PACK2   A_5, A_4, A_54
        PACK2   A_7, A_6, A_76

        ; (sum + 2) >> 2 on each halfword.
        ADD2    A_22, A_10, C_10
        ADD2    A_22, A_32, C_32
        ADD2    A_22, A_54, C_54
        ADD2    A_22, A_76, C_76

        SHRU2   C_10, A_22, E_10
        SHRU2   C_32, A_22, E_32
        SHRU2   C_54, A_22, E_54
        SHRU2   C_76, A_22, E_76

        ; Keep the low byte of every halfword -> 8 result bytes.
        PACKL4  E_32, E_10, D_3210
        PACKL4  E_76, E_54, D_7654

        STDW    D_7654:D_3210, *A_dst0++[A_pitch]       ; store row r, step 2 rows

        ; ================ output row r+1 : rows B + C ================
        PACK2   B_3210, C_3210, F_1010          ; b1 b0 c1 c0 -> pixel 0
        PACKH2  C_3210, B_3210, F_3232          ; c3 c2 b3 b2 -> pixel 2
        PACK2   B_7654, C_7654, F_5454          ; pixel 4
        PACKH2  C_7654, B_7654, F_7676          ; pixel 6
        PACK2   B_4321, C_4321, F_2121          ; pixel 1
        PACKH2  C_4321, B_4321, F_4343          ; pixel 3
        PACK2   B_8765, C_8765, F_6565          ; pixel 5
        PACKH2  C_8765, B_8765, F_8787          ; pixel 7

        DOTPU4  F_1010, B_1111, B_0
        DOTPU4  F_2121, B_1111, B_1
        DOTPU4  F_3232, B_1111, B_2
        DOTPU4  F_4343, B_1111, B_3
        DOTPU4  F_5454, B_1111, B_4
        DOTPU4  F_6565, B_1111, B_5
        DOTPU4  F_7676, B_1111, B_6
        DOTPU4  F_8787, B_1111, B_7

        PACK2   B_1, B_0, B_10
        PACK2   B_3, B_2, B_32
        PACK2   B_5, B_4, B_54
        PACK2   B_7, B_6, B_76

        ADD2    B_22, B_10, D_10
        ADD2    B_22, B_32, D_32
        ADD2    B_22, B_54, D_54
        ADD2    B_22, B_76, D_76

        SHRU2   D_10, B_22, F_10
        SHRU2   D_32, B_22, F_32
        SHRU2   D_54, B_22, F_54
        SHRU2   D_76, B_22, F_76

        PACKL4  F_32, F_10, E_3210
        PACKL4  F_76, F_54, E_7654

        STDW    E_7654:E_3210, *A_dst1++[A_pitch]       ; store row r+1, step 2 rows

        ; ---- loop control: A_i counts the remaining passes ----
 [A_i]  SUB     A_i, 1, A_i
 [A_i]  B       LOOP_HV_88

        .return
        .endproc