FFmpeg 4.4.4
hevc_mc_uni_msa.c
/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* The #include lines were elided by the source browser; the three below are
 * assumed from the macros this file uses (MSA load/store/shuffle helpers,
 * the HEVC MSA filter helpers, and the MIPS dsp prototypes). */
#include "libavutil/mips/generic_macros_msa.h"
#include "hevcdsp_mips.h"
#include "hevc_macros_msa.h"
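
/* Conventions from the MSA helper macros used below: LD_xx / ST_xx load and
 * store vectors, VSHF_Bx shuffles bytes through a mask vector, DOTP_SBx
 * forms signed-byte dot products of adjacent pairs into halfwords,
 * DPADD_SBx accumulates further pair products onto them, SRARI_x is a
 * rounding arithmetic right shift, SAT_x saturates, and PCKEV_x packs the
 * even-indexed elements of two vectors. */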
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
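
/* In the table above, row 0 (bytes 0..15) gathers the overlapping (x, x+1)
 * byte pairs that feed the two-tap dot products for one 8-pixel row.  The
 * "4 width" rows use indices of 16 and above, which the VSHF_B2/VSHF_B4
 * macros resolve from their second source vector, so two 4-pixel rows are
 * filtered in a single 16-byte vector. */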

#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, mask2, mask3,            \
                                   filt0, filt1, filt2, filt3,            \
                                   out0, out1)                            \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);     \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);     \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);     \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);     \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);               \
}

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;   \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);       \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2,       \
                 filt2, out0, out1, out2, out3);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);       \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1,       \
                 filt1, out0, out1, out2, out3);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);       \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3,       \
                 filt3, out0, out1, out2, out3);                            \
}

#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,            \
                                   mask0, mask1, filt0, filt1,        \
                                   out0, out1)                        \
{                                                                     \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                             \
                                                                      \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);           \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, filt0, filt1,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                   \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);       \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1,       \
                 filt1, out0, out1, out2, out3);                            \
}
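
/* All four HORIZ_* macros use the same trick: SPLATI_H4_SB (see below)
 * replicates each 16-bit halfword of the filter (two adjacent 8-bit taps)
 * across a byte vector, so each DOTP/DPADD against filtN adds the
 * contribution of taps 2N and 2N+1:
 *
 *   out[x] = src[x+0]*f0 + src[x+1]*f1    dot product with filt0
 *          + src[x+2]*f2 + src[x+3]*f3    accumulate with filt1
 *          + src[x+4]*f4 + src[x+5]*f5    accumulate with filt2
 *          + src[x+6]*f6 + src[x+7]*f7    accumulate with filt3
 */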
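/* Plain block copies for the unfiltered (integer-pel) cases, widths 8..64.
 * Widths that are not a multiple of 16 mix vector stores with 64-bit and
 * 32-bit scalar stores. */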
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

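/* Horizontal 8-tap filters.  Pixels are XORed with 128 so the unsigned bytes
 * land in signed range for the signed dot products; PCKEV_XORI128_UB flips
 * them back after packing.  SRARI ..., 6 is the HEVC 8-bit interpolation
 * rounding shift, and SAT ..., 7 clamps the halfword results to signed
 * 8-bit range before packing. */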
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
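
/* A scalar model of what the common_hz_8t_* paths compute, shown as a
 * disabled sketch for reference only (ref_hz_8t is an illustrative name,
 * not an FFmpeg function): an 8-tap filter around x, a rounded shift by 6,
 * and a clamp to 8 bits. */
#if 0
static void ref_hz_8t(const uint8_t *src, int src_stride,
                      uint8_t *dst, int dst_stride,
                      const int8_t *filter, int width, int height)
{
    int x, y, k, sum;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++) {
            sum = 0;
            for (k = 0; k < 8; k++)                  /* taps f0..f7 */
                sum += src[x - 3 + k] * filter[k];   /* matches src -= 3 */
            sum = (sum + 32) >> 6;                   /* SRARI_H ..., 6 */
            dst[x] = sum < 0 ? 0 : (sum > 255 ? 255 : sum); /* SAT + pack */
        }
        src += src_stride;
        dst += dst_stride;
    }
}
#endif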

static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src = src - 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST_D2(out, 0, 1, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

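/* Vertical 8-tap filters.  ILVR_B/ILVL_B interleave bytes of adjacent rows
 * so each halfword lane again holds a (row n, row n+1) pair, letting the
 * same two-taps-per-dot-product scheme run down the columns.  At the end of
 * each loop iteration the interleaved history is rotated so only the newest
 * rows need loading. */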
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

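/* Vertical 8-tap filter processed in 16-pixel-wide column strips
 * (width >> 4 of them); the 24/32/48/64-wide wrappers below reuse this. */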
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

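/* Horizontal-then-vertical (HV) 8-tap filters.  The horizontal pass keeps
 * 16-bit intermediates without rounding; the vertical pass interleaves them
 * and accumulates at 32 bits, and the result is shifted down by 6 twice
 * (SRA then SRARI below) before packing back to bytes. */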
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}

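/* HV 8-tap filter for widths that are multiples of 8: processes 8-pixel
 * column strips (width >> 3), two output rows per inner iteration. */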
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height, int32_t width)
{
1477  uint32_t loop_cnt, cnt;
1478  uint8_t *src_tmp;
1479  uint8_t *dst_tmp;
1480  v16u8 out;
1481  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1482  v8i16 filt0, filt1, filt2, filt3;
1483  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1484  v16i8 mask1, mask2, mask3;
1485  v8i16 filter_vec;
1486  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1487  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1488  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1489  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1490  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1491  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1492  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1493  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1494  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1495 
1496  src -= ((3 * src_stride) + 3);
1497 
1498  filter_vec = LD_SH(filter_x);
1499  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1500 
1501  filter_vec = LD_SH(filter_y);
1502  UNPCK_R_SB_SH(filter_vec, filter_vec);
1503 
1504  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1505 
1506  mask1 = mask0 + 2;
1507  mask2 = mask0 + 4;
1508  mask3 = mask0 + 6;
1509 
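 /* filter the block in 8-column stripes, each walked top to bottom */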
1510  for (cnt = width >> 3; cnt--;) {
1511  src_tmp = src;
1512  dst_tmp = dst;
1513 
1514  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1515  src_tmp += (7 * src_stride);
1516  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1517 
1518  /* row 0 row 1 row 2 row 3 */
1519  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1520  vec0, vec1, vec2, vec3);
1521  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1522  vec4, vec5, vec6, vec7);
1523  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1524  vec8, vec9, vec10, vec11);
1525  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1526  vec12, vec13, vec14, vec15);
1527  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1528  filt3);
1529  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1530  filt3);
1531  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1532  filt3);
1533  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1534  filt2, filt3);
1535 
1536  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1537  vec0, vec1, vec2, vec3);
1538  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1539  vec4, vec5, vec6, vec7);
1540  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1541  vec8, vec9, vec10, vec11);
1542  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1543  filt3);
1544  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1545  filt3);
1546  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1547  filt3);
1548 
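 /* vertical pass: two output rows per iteration; dst0..dst6 hold the
  * previous seven horizontal results and slide down by two each pass */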
1549  for (loop_cnt = height >> 1; loop_cnt--;) {
1550  LD_SB2(src_tmp, src_stride, src7, src8);
1551  XORI_B2_128_SB(src7, src8);
1552  src_tmp += 2 * src_stride;
1553 
1554  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1555  dst10_r, dst32_r, dst54_r, dst21_r);
1556  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1557  dst10_l, dst32_l, dst54_l, dst21_l);
1558  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1559  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1560 
1561  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1562  vec0, vec1, vec2, vec3);
1563  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1564  filt2, filt3);
1565 
1566  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1567  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1568  filt_h0, filt_h1, filt_h2, filt_h3);
1569  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1570  filt_h0, filt_h1, filt_h2, filt_h3);
1571  dst0_r >>= 6;
1572  dst0_l >>= 6;
1573 
1574  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1575  vec0, vec1, vec2, vec3);
1576  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1577  filt2, filt3);
1578 
1579  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1580  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1581  filt_h0, filt_h1, filt_h2, filt_h3);
1582  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1583  filt_h0, filt_h1, filt_h2, filt_h3);
1584  dst1_r >>= 6;
1585  dst1_l >>= 6;
1586  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1587  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1588 
1589  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1590  out = PCKEV_XORI128_UB(dst0, dst1);
1591  ST_D2(out, 0, 1, dst_tmp, dst_stride);
1592  dst_tmp += (2 * dst_stride);
1593 
1594  dst0 = dst2;
1595  dst1 = dst3;
1596  dst2 = dst4;
1597  dst3 = dst5;
1598  dst4 = dst6;
1599  dst5 = dst7;
1600  dst6 = dst8;
1601  }
1602 
1603  src += 8;
1604  dst += 8;
1605  }
1606 }
1607 
1608 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1609  int32_t src_stride,
1610  uint8_t *dst,
1611  int32_t dst_stride,
1612  const int8_t *filter_x,
1613  const int8_t *filter_y,
1614  int32_t height)
1615 {
1616  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1617  filter_x, filter_y, height, 8);
1618 }
1619 
1620 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1621  int32_t src_stride,
1622  uint8_t *dst,
1623  int32_t dst_stride,
1624  const int8_t *filter_x,
1625  const int8_t *filter_y,
1626  int32_t height)
1627 {
1628  uint32_t loop_cnt;
1629  uint8_t *src_tmp, *dst_tmp;
1630  v16u8 out0, out1;
1631  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1632  v16i8 src11, src12, src13, src14;
1633  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1634  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1635  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1636  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1637  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1638  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1639  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1640  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1641  v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1642  v8i16 dst1413_r, dst87_l, filter_vec;
1643  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1644  v4i32 dst0_l, dst1_l;
1645 
1646  src -= ((3 * src_stride) + 3);
1647 
1648  filter_vec = LD_SH(filter_x);
1649  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1650 
1651  filter_vec = LD_SH(filter_y);
1652  UNPCK_R_SB_SH(filter_vec, filter_vec);
1653 
1654  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1655 
1656  mask0 = LD_SB(ff_hevc_mask_arr);
1657  mask1 = mask0 + 2;
1658  mask2 = mask0 + 4;
1659  mask3 = mask0 + 6;
1660 
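 /* first pass: the left 8x16 block, two rows per iteration */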
1661  src_tmp = src;
1662  dst_tmp = dst;
1663 
1664  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1665  src_tmp += (7 * src_stride);
1666  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1667 
1668  /* row 0 row 1 row 2 row 3 */
1669  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1670  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1671  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1672  vec11);
1673  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1674  vec15);
1675  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1676  filt3);
1677  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1678  filt3);
1679  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1680  filt3);
1681  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1682  filt2, filt3);
1683 
1684  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1685  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1686  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1687  vec11);
1688  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1689  filt3);
1690  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1691  filt3);
1692  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1693  filt3);
1694 
1695  for (loop_cnt = 8; loop_cnt--;) {
1696  LD_SB2(src_tmp, src_stride, src7, src8);
1697  XORI_B2_128_SB(src7, src8);
1698  src_tmp += 2 * src_stride;
1699 
1700  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1701  dst32_r, dst54_r, dst21_r);
1702  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1703  dst32_l, dst54_l, dst21_l);
1704  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1705  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1706 
1707  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1708  vec3);
1709  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1710  filt3);
1711 
1712  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1713  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1714  filt_h0, filt_h1, filt_h2, filt_h3);
1715  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1716  filt_h0, filt_h1, filt_h2, filt_h3);
1717  dst0_r >>= 6;
1718  dst0_l >>= 6;
1719 
1720  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1721  vec3);
1722  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1723  filt3);
1724 
1725  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1726  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1727  filt_h0, filt_h1, filt_h2, filt_h3);
1728  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1729  filt_h0, filt_h1, filt_h2, filt_h3);
1730  dst1_r >>= 6;
1731  dst1_l >>= 6;
1732  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1733  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1734 
1735  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1736  out0 = PCKEV_XORI128_UB(dst0, dst1);
1737  ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1738  dst_tmp += (2 * dst_stride);
1739 
1740  dst0 = dst2;
1741  dst1 = dst3;
1742  dst2 = dst4;
1743  dst3 = dst5;
1744  dst4 = dst6;
1745  dst5 = dst7;
1746  dst6 = dst8;
1747  }
1748 
1749  src += 8;
1750  dst += 8;
1751 
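 /* second pass: the remaining 4x16 block, using the 4-width masks */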
1752  mask4 = LD_SB(ff_hevc_mask_arr + 16);
1753  mask5 = mask4 + 2;
1754  mask6 = mask4 + 4;
1755  mask7 = mask4 + 6;
1756 
1757  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1758  src += (7 * src_stride);
1759  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1760 
1761  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1762  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1763  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1764  vec11);
1765  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1766  vec15);
1767 
1768  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1769  filt3);
1770  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1771  filt3);
1772  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1773  filt3);
1774  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1775  filt3);
1776 
1777  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1778  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1779  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1780 
1781  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1782 
1783  for (loop_cnt = 2; loop_cnt--;) {
1784  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1785  src14);
1786  src += (8 * src_stride);
1787  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1788 
1789  VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1790  vec3);
1791  VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1792  vec7);
1793  VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1794  vec11);
1795  VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1796  vec14, vec15);
1797 
1798  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1799  filt3);
1800  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1801  filt3);
1802  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1803  filt2, filt3);
1804  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1805  filt2, filt3);
1806 
1807  dst76_r = __msa_ilvr_h(dst117, dst66);
1808  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1809  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1810  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1811  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1812  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1813 
1814  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1815  filt_h1, filt_h2, filt_h3);
1816  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1817  filt_h1, filt_h2, filt_h3);
1818  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1819  filt_h1, filt_h2, filt_h3);
1820  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1821  filt_h1, filt_h2, filt_h3);
1822  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1823  filt_h1, filt_h2, filt_h3);
1824  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1825  filt_h1, filt_h2, filt_h3);
1826  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1827  filt_h1, filt_h2, filt_h3);
1828  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1829  filt_h0, filt_h1, filt_h2, filt_h3);
1830 
1831  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1832  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1833  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1834  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1835  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1836  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1837  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1838  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1839  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1840  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1841  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1842  dst += (8 * dst_stride);
1843 
1844  dst10_r = dst98_r;
1845  dst32_r = dst1110_r;
1846  dst54_r = dst1312_r;
1847  dst21_r = dst109_r;
1848  dst43_r = dst1211_r;
1849  dst65_r = dst1413_r;
1850  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1851  }
1852 }
1853 
1854 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1855  int32_t src_stride,
1856  uint8_t *dst,
1857  int32_t dst_stride,
1858  const int8_t *filter_x,
1859  const int8_t *filter_y,
1860  int32_t height)
1861 {
1862  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1863  filter_x, filter_y, height, 16);
1864 }
1865 
1866 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1867  int32_t src_stride,
1868  uint8_t *dst,
1869  int32_t dst_stride,
1870  const int8_t *filter_x,
1871  const int8_t *filter_y,
1872  int32_t height)
1873 {
1874  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1875  filter_x, filter_y, height, 24);
1876 }
1877 
1878 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1879  int32_t src_stride,
1880  uint8_t *dst,
1881  int32_t dst_stride,
1882  const int8_t *filter_x,
1883  const int8_t *filter_y,
1884  int32_t height)
1885 {
1886  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1887  filter_x, filter_y, height, 32);
1888 }
1889 
1890 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1891  int32_t src_stride,
1892  uint8_t *dst,
1893  int32_t dst_stride,
1894  const int8_t *filter_x,
1895  const int8_t *filter_y,
1896  int32_t height)
1897 {
1898  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1899  filter_x, filter_y, height, 48);
1900 }
1901 
1902 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1903  int32_t src_stride,
1904  uint8_t *dst,
1905  int32_t dst_stride,
1906  const int8_t *filter_x,
1907  const int8_t *filter_y,
1908  int32_t height)
1909 {
1910  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1911  filter_x, filter_y, height, 64);
1912 }
1913 
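/* The common_hz_4t_* family below is the HEVC uni-prediction 4-tap (epel)
 * horizontal filter. As a rough scalar sketch (assuming 8-bit samples and
 * FFmpeg's av_clip_uint8 helper), one output pixel is:
 *
 *     dst[x] = av_clip_uint8((filter[0] * src[x - 1] +
 *                             filter[1] * src[x    ] +
 *                             filter[2] * src[x + 1] +
 *                             filter[3] * src[x + 2] + 32) >> 6);
 *
 * The vector code reaches the same result with SRARI_H (rounding shift by
 * 6) and SAT_SH, while the xor-with-128 in XORI_B*_128 and
 * PCKEV_XORI128_UB converts between unsigned pixels and the signed bytes
 * expected by the MSA dot products. */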
1914 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1915  uint8_t *dst, int32_t dst_stride,
1916  const int8_t *filter)
1917 {
1918  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1919  v16u8 out;
1920  v8i16 filt, res0;
1921 
1922  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1923  src -= 1;
1924 
1925  /* rearranging filter */
1926  filt = LD_SH(filter);
1927  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1928 
1929  mask1 = mask0 + 2;
1930 
1931  LD_SB2(src, src_stride, src0, src1);
1932  XORI_B2_128_SB(src0, src1);
1933  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1934  res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
1935  res0 = __msa_srari_h(res0, 6);
1936  res0 = __msa_sat_s_h(res0, 7);
1937  out = PCKEV_XORI128_UB(res0, res0);
1938  ST_W2(out, 0, 1, dst, dst_stride);
1939 }
1940 
1941 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1942  uint8_t *dst, int32_t dst_stride,
1943  const int8_t *filter)
1944 {
1945  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1946  v8i16 filt, out0, out1;
1947  v16u8 out;
1948 
1949  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1950  src -= 1;
1951 
1952  /* rearranging filter */
1953  filt = LD_SH(filter);
1954  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1955 
1956  mask1 = mask0 + 2;
1957 
1958  LD_SB4(src, src_stride, src0, src1, src2, src3);
1959  XORI_B4_128_SB(src0, src1, src2, src3);
1960  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1961  filt0, filt1, out0, out1);
1962  SRARI_H2_SH(out0, out1, 6);
1963  SAT_SH2_SH(out0, out1, 7);
1964  out = PCKEV_XORI128_UB(out0, out1);
1965  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1966 }
1967 
1968 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1969  uint8_t *dst, int32_t dst_stride,
1970  const int8_t *filter)
1971 {
1972  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1973  v16u8 out;
1974  v8i16 filt, out0, out1, out2, out3;
1975 
1976  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1977  src -= 1;
1978 
1979  /* rearranging filter */
1980  filt = LD_SH(filter);
1981  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1982 
1983  mask1 = mask0 + 2;
1984 
1985  LD_SB4(src, src_stride, src0, src1, src2, src3);
1986  src += (4 * src_stride);
1987 
1988  XORI_B4_128_SB(src0, src1, src2, src3);
1989  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1990  filt0, filt1, out0, out1);
1991  LD_SB4(src, src_stride, src0, src1, src2, src3);
1992  XORI_B4_128_SB(src0, src1, src2, src3);
1993  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1994  filt0, filt1, out2, out3);
1995  SRARI_H4_SH(out0, out1, out2, out3, 6);
1996  SAT_SH4_SH(out0, out1, out2, out3, 7);
1997  out = PCKEV_XORI128_UB(out0, out1);
1998  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1999  out = PCKEV_XORI128_UB(out2, out3);
2000  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2001 }
2002 
2003 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2004  uint8_t *dst, int32_t dst_stride,
2005  const int8_t *filter)
2006 {
2007  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2008  v16i8 filt0, filt1, mask0, mask1;
2009  v16u8 out;
2010  v8i16 filt, out0, out1, out2, out3;
2011 
2012  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2013  src -= 1;
2014 
2015  /* rearranging filter */
2016  filt = LD_SH(filter);
2017  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2018 
2019  mask1 = mask0 + 2;
2020 
2021  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2022  src += (8 * src_stride);
2023  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2024  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2025  filt0, filt1, out0, out1);
2026  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2027  filt0, filt1, out2, out3);
2028  SRARI_H4_SH(out0, out1, out2, out3, 6);
2029  SAT_SH4_SH(out0, out1, out2, out3, 7);
2030  out = PCKEV_XORI128_UB(out0, out1);
2031  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2032  out = PCKEV_XORI128_UB(out2, out3);
2033  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2034  dst += (8 * dst_stride);
2035 
2036  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2037  src += (8 * src_stride);
2038  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2039  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2040  filt0, filt1, out0, out1);
2041  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2042  filt0, filt1, out2, out3);
2043  SRARI_H4_SH(out0, out1, out2, out3, 6);
2044  SAT_SH4_SH(out0, out1, out2, out3, 7);
2045  out = PCKEV_XORI128_UB(out0, out1);
2046  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2047  out = PCKEV_XORI128_UB(out2, out3);
2048  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2049 }
2050 
2051 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
2052  uint8_t *dst, int32_t dst_stride,
2053  const int8_t *filter, int32_t height)
2054 {
2055  if (2 == height) {
2056  common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2057  } else if (4 == height) {
2058  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2059  } else if (8 == height) {
2060  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2061  } else if (16 == height) {
2062  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2063  }
2064 }
2065 
2066 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2067  uint8_t *dst, int32_t dst_stride,
2068  const int8_t *filter, int32_t height)
2069 {
2070  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2071  v16u8 out4, out5;
2072  v8i16 filt, out0, out1, out2, out3;
2073 
2074  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2075  src -= 1;
2076 
2077  /* rearranging filter */
2078  filt = LD_SH(filter);
2079  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2080 
2081  mask1 = mask0 + 2;
2082 
2083  LD_SB4(src, src_stride, src0, src1, src2, src3);
2084  src += (4 * src_stride);
2085 
2086  XORI_B4_128_SB(src0, src1, src2, src3);
2087  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2088  filt1, out0, out1, out2, out3);
2089  SRARI_H4_SH(out0, out1, out2, out3, 6);
2090  SAT_SH4_SH(out0, out1, out2, out3, 7);
2091  out4 = PCKEV_XORI128_UB(out0, out1);
2092  out5 = PCKEV_XORI128_UB(out2, out3);
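 /* 6-wide rows are stored as a 4-byte word plus a 2-byte halfword */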
2093  ST_W2(out4, 0, 2, dst, dst_stride);
2094  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2095  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2096  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2097  dst += (4 * dst_stride);
2098 
2099  LD_SB4(src, src_stride, src0, src1, src2, src3);
2100  src += (4 * src_stride);
2101 
2102  XORI_B4_128_SB(src0, src1, src2, src3);
2103  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2104  filt1, out0, out1, out2, out3);
2105  SRARI_H4_SH(out0, out1, out2, out3, 6);
2106  SAT_SH4_SH(out0, out1, out2, out3, 7);
2107  out4 = PCKEV_XORI128_UB(out0, out1);
2108  out5 = PCKEV_XORI128_UB(out2, out3);
2109  ST_W2(out4, 0, 2, dst, dst_stride);
2110  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2111  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2112  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2113 }
2114 
2115 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2116  uint8_t *dst, int32_t dst_stride,
2117  const int8_t *filter, int32_t height)
2118 {
2119  uint32_t loop_cnt;
2120  v16i8 src0, src1, filt0, filt1, mask0, mask1;
2121  v16u8 out;
2122  v8i16 filt, vec0, vec1, vec2, vec3;
2123 
2124  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2125  src -= 1;
2126 
2127  filt = LD_SH(filter);
2128  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2129 
2130  mask1 = mask0 + 2;
2131 
2132  for (loop_cnt = (height >> 1); loop_cnt--;) {
2133  LD_SB2(src, src_stride, src0, src1);
2134  src += (2 * src_stride);
2135 
2136  XORI_B2_128_SB(src0, src1);
2137  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2138  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2139  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2140  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2141  SRARI_H2_SH(vec0, vec1, 6);
2142  SAT_SH2_SH(vec0, vec1, 7);
2143  out = PCKEV_XORI128_UB(vec0, vec1);
2144  ST_D2(out, 0, 1, dst, dst_stride);
2145  dst += (2 * dst_stride);
2146  }
2147 }
2148 
2149 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2150  uint8_t *dst, int32_t dst_stride,
2151  const int8_t *filter, int32_t height)
2152 {
2153  uint32_t loop_cnt;
2154  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2155  v16u8 tmp0, tmp1;
2156  v8i16 filt, out0, out1, out2, out3;
2157 
2158  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2159  src -= 1;
2160 
2161  /* rearranging filter */
2162  filt = LD_SH(filter);
2163  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2164 
2165  mask1 = mask0 + 2;
2166 
2167  for (loop_cnt = (height >> 2); loop_cnt--;) {
2168  LD_SB4(src, src_stride, src0, src1, src2, src3);
2169  src += (4 * src_stride);
2170 
2171  XORI_B4_128_SB(src0, src1, src2, src3);
2172  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2173  filt1, out0, out1, out2, out3);
2174  SRARI_H4_SH(out0, out1, out2, out3, 6);
2175  SAT_SH4_SH(out0, out1, out2, out3, 7);
2176  tmp0 = PCKEV_XORI128_UB(out0, out1);
2177  tmp1 = PCKEV_XORI128_UB(out2, out3);
2178  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2179  dst += (4 * dst_stride);
2180  }
2181 }
2182 
2183 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2184  uint8_t *dst, int32_t dst_stride,
2185  const int8_t *filter, int32_t height)
2186 {
2187  if ((2 == height) || (6 == height)) {
2188  common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2189  height);
2190  } else {
2191  common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2192  height);
2193  }
2194 }
2195 
2196 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2197  uint8_t *dst, int32_t dst_stride,
2198  const int8_t *filter, int32_t height)
2199 {
2200  uint32_t loop_cnt;
2201  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2202  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2203  v16i8 vec10, vec11;
2204  v16u8 tmp0, tmp1;
2205  v8i16 filt, out0, out1, out2, out3, out4, out5;
2206 
2207  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2208  mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2209 
2210  src -= 1;
2211 
2212  /* rearranging filter */
2213  filt = LD_SH(filter);
2214  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2215 
2216  mask1 = mask0 + 2;
2217  mask3 = mask2 + 2;
2218 
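 /* columns 8..11 first via mask2/mask3 (two rows per vector), then the
  * left 8 columns via mask0/mask1 */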
2219  for (loop_cnt = 4; loop_cnt--;) {
2220  LD_SB4(src, src_stride, src0, src1, src2, src3);
2221  src += (4 * src_stride);
2222 
2223  XORI_B4_128_SB(src0, src1, src2, src3);
2224  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2225  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2226  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2227  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2228  SRARI_H2_SH(out0, out1, 6);
2229  SAT_SH2_SH(out0, out1, 7);
2230  tmp0 = PCKEV_XORI128_UB(out0, out1);
2231  ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2232 
2233  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2234  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2235  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2236  out2, out3, out4, out5);
2237  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2238  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2239  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2240  out2, out3, out4, out5);
2241  SRARI_H4_SH(out2, out3, out4, out5, 6);
2242  SAT_SH4_SH(out2, out3, out4, out5, 7);
2243  tmp0 = PCKEV_XORI128_UB(out2, out3);
2244  tmp1 = PCKEV_XORI128_UB(out4, out5);
2245  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2246  dst += (4 * dst_stride);
2247  }
2248 }
2249 
2250 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2251  uint8_t *dst, int32_t dst_stride,
2252  const int8_t *filter, int32_t height)
2253 {
2254  uint32_t loop_cnt;
2255  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2256  v16i8 filt0, filt1, mask0, mask1;
2257  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2258  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2259  v16u8 out;
2260 
2261  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2262  src -= 1;
2263 
2264  /* rearranging filter */
2265  filt = LD_SH(filter);
2266  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2267 
2268  mask1 = mask0 + 2;
2269 
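 /* the src + 8 loads supply output columns 8..15, whose filter taps run
  * past the end of the first 16-byte load */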
2270  for (loop_cnt = (height >> 2); loop_cnt--;) {
2271  LD_SB4(src, src_stride, src0, src2, src4, src6);
2272  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2273  src += (4 * src_stride);
2274 
2275  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2276 
2277  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2278  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2279  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2280  out0, out1, out2, out3);
2281  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2282  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2283  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2284  out0, out1, out2, out3);
2285  SRARI_H4_SH(out0, out1, out2, out3, 6);
2286  SAT_SH4_SH(out0, out1, out2, out3, 7);
2287  out = PCKEV_XORI128_UB(out0, out1);
2288  ST_UB(out, dst);
2289  dst += dst_stride;
2290  out = PCKEV_XORI128_UB(out2, out3);
2291  ST_UB(out, dst);
2292  dst += dst_stride;
2293 
2294  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2295  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2296  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2297  out4, out5, out6, out7);
2298  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2299  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2300  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2301  out4, out5, out6, out7);
2302  SRARI_H4_SH(out4, out5, out6, out7, 6);
2303  SAT_SH4_SH(out4, out5, out6, out7, 7);
2304  out = PCKEV_XORI128_UB(out4, out5);
2305  ST_UB(out, dst);
2306  dst += dst_stride;
2307  out = PCKEV_XORI128_UB(out6, out7);
2308  ST_UB(out, dst);
2309  dst += dst_stride;
2310  }
2311 }
2312 
2313 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2314  uint8_t *dst, int32_t dst_stride,
2315  const int8_t *filter, int32_t height)
2316 {
2317  uint8_t *dst1 = dst + 16;
2318  uint32_t loop_cnt;
2319  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2320  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2321  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2322  v8i16 filt, out0, out1, out2, out3;
2323  v16u8 tmp0, tmp1;
2324 
2325  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2326  src -= 1;
2327 
2328  /* rearranging filter */
2329  filt = LD_SH(filter);
2330  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2331 
2332  mask1 = mask0 + 2;
2333  mask00 = mask0 + 8;
2334  mask11 = mask0 + 10;
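 /* mask00/mask11 pick the bytes that straddle the two 16-byte loads */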
2335 
2336  for (loop_cnt = 8; loop_cnt--;) {
2337  LD_SB4(src, src_stride, src0, src2, src4, src6);
2338  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2339  src += (4 * src_stride);
2340 
2341  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2342  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2343  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2344  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2345  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2346  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2347  out0, out1, out2, out3);
2348  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2349  out0, out1, out2, out3);
2350  SRARI_H4_SH(out0, out1, out2, out3, 6);
2351  SAT_SH4_SH(out0, out1, out2, out3, 7);
2352  tmp0 = PCKEV_XORI128_UB(out0, out1);
2353  ST_UB(tmp0, dst);
2354  dst += dst_stride;
2355  tmp0 = PCKEV_XORI128_UB(out2, out3);
2356  ST_UB(tmp0, dst);
2357  dst += dst_stride;
2358 
2359  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2360  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2361  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2362  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2363  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2364  out0, out1, out2, out3);
2365  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2366  out0, out1, out2, out3);
2367  SRARI_H4_SH(out0, out1, out2, out3, 6);
2368  SAT_SH4_SH(out0, out1, out2, out3, 7);
2369  tmp0 = PCKEV_XORI128_UB(out0, out1);
2370  ST_UB(tmp0, dst);
2371  dst += dst_stride;
2372  tmp0 = PCKEV_XORI128_UB(out2, out3);
2373  ST_UB(tmp0, dst);
2374  dst += dst_stride;
2375 
2376  /* 8 width */
2377  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2378  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2379  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2380  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2381 
2382  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2383  out0, out1, out2, out3);
2384  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2385  out0, out1, out2, out3);
2386 
2387  SRARI_H4_SH(out0, out1, out2, out3, 6);
2388  SAT_SH4_SH(out0, out1, out2, out3, 7);
2389  tmp0 = PCKEV_XORI128_UB(out0, out1);
2390  tmp1 = PCKEV_XORI128_UB(out2, out3);
2391  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2392  dst1 += (4 * dst_stride);
2393  }
2394 }
2395 
2396 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2397  uint8_t *dst, int32_t dst_stride,
2398  const int8_t *filter, int32_t height)
2399 {
2400  uint32_t loop_cnt;
2401  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2402  v16i8 filt0, filt1, mask0, mask1;
2403  v16u8 out;
2404  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2405  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2406 
2407  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2408  src -= 1;
2409 
2410  /* rearranging filter */
2411  filt = LD_SH(filter);
2412  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2413 
2414  mask1 = mask0 + 2;
2415 
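 /* two 32-wide rows per iteration, four 16-byte loads per row */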
2416  for (loop_cnt = (height >> 1); loop_cnt--;) {
2417  src0 = LD_SB(src);
2418  src1 = LD_SB(src + 8);
2419  src2 = LD_SB(src + 16);
2420  src3 = LD_SB(src + 24);
2421  src += src_stride;
2422  src4 = LD_SB(src);
2423  src5 = LD_SB(src + 8);
2424  src6 = LD_SB(src + 16);
2425  src7 = LD_SB(src + 24);
2426  src += src_stride;
2427 
2428  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2429 
2430  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2431  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2432  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2433  out0, out1, out2, out3);
2434  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2435  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2436  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2437  out0, out1, out2, out3);
2438 
2439  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2440  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2441  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2442  out4, out5, out6, out7);
2443  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2444  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2445  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2446  out4, out5, out6, out7);
2447  SRARI_H4_SH(out0, out1, out2, out3, 6);
2448  SRARI_H4_SH(out4, out5, out6, out7, 6);
2449  SAT_SH4_SH(out0, out1, out2, out3, 7);
2450  SAT_SH4_SH(out4, out5, out6, out7, 7);
2451  out = PCKEV_XORI128_UB(out0, out1);
2452  ST_UB(out, dst);
2453  out = PCKEV_XORI128_UB(out2, out3);
2454  ST_UB(out, dst + 16);
2455  dst += dst_stride;
2456  out = PCKEV_XORI128_UB(out4, out5);
2457  ST_UB(out, dst);
2458  out = PCKEV_XORI128_UB(out6, out7);
2459  ST_UB(out, dst + 16);
2460  dst += dst_stride;
2461  }
2462 }
2463 
2464 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2465  uint8_t *dst, int32_t dst_stride,
2466  const int8_t *filter)
2467 {
2468  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2469  v16i8 src2110, src4332, filt0, filt1;
2470  v16u8 out;
2471  v8i16 filt, out10;
2472 
2473  src -= src_stride;
2474 
2475  filt = LD_SH(filter);
2476  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2477 
2478  LD_SB3(src, src_stride, src0, src1, src2);
2479  src += (3 * src_stride);
2480 
2481  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2482  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2483  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2484  LD_SB2(src, src_stride, src3, src4);
2485  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2486  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2487  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2488  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2489  out10 = __msa_srari_h(out10, 6);
2490  out10 = __msa_sat_s_h(out10, 7);
2491  out = PCKEV_XORI128_UB(out10, out10);
2492  ST_W2(out, 0, 1, dst, dst_stride);
2493 }
2494 
2495 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2496  uint8_t *dst, int32_t dst_stride,
2497  const int8_t *filter, int32_t height)
2498 {
2499  uint32_t loop_cnt;
2500  v16i8 src0, src1, src2, src3, src4, src5;
2501  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2502  v16i8 src2110, src4332, filt0, filt1;
2503  v8i16 filt, out10, out32;
2504  v16u8 out;
2505 
2506  src -= src_stride;
2507 
2508  filt = LD_SH(filter);
2509  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2510 
2511  LD_SB3(src, src_stride, src0, src1, src2);
2512  src += (3 * src_stride);
2513 
2514  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2515 
2516  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2517  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2518 
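 /* four 4-wide rows per iteration; ilvr_d packs each row pair so a
  * single HEVC_FILT_4TAP_SH call filters two rows */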
2519  for (loop_cnt = (height >> 2); loop_cnt--;) {
2520  LD_SB3(src, src_stride, src3, src4, src5);
2521  src += (3 * src_stride);
2522  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2523  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2524  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2525  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2526 
2527  src2 = LD_SB(src);
2528  src += (src_stride);
2529  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2530  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2531  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2532  out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2533  SRARI_H2_SH(out10, out32, 6);
2534  SAT_SH2_SH(out10, out32, 7);
2535  out = PCKEV_XORI128_UB(out10, out32);
2536  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2537  dst += (4 * dst_stride);
2538  }
2539 }
2540 
2541 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2542  uint8_t *dst, int32_t dst_stride,
2543  const int8_t *filter, int32_t height)
2544 {
2545  if (2 == height) {
2546  common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2547  } else {
2548  common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2549  height);
2550  }
2551 }
2552 
2553 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2554  uint8_t *dst, int32_t dst_stride,
2555  const int8_t *filter, int32_t height)
2556 {
2557  v16u8 out0, out1;
2558  v16i8 src0, src1, src2, src3, src4, src5, src6;
2559  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2560  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2561 
2562  src -= src_stride;
2563 
2564  filter_vec = LD_SH(filter);
2565  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2566 
2567  LD_SB3(src, src_stride, src0, src1, src2);
2568  src += (3 * src_stride);
2569  XORI_B3_128_SB(src0, src1, src2);
2570  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2571 
2572  LD_SB2(src, src_stride, src3, src4);
2573  src += (2 * src_stride);
2574  XORI_B2_128_SB(src3, src4);
2575  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2576 
2577  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2578  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2579 
2580  LD_SB2(src, src_stride, src5, src6);
2581  src += (2 * src_stride);
2582  XORI_B2_128_SB(src5, src6);
2583  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2584 
2585  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2586  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2587 
2588  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2589  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2590  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2591  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2592  ST_W2(out0, 0, 2, dst, dst_stride);
2593  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2594  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2595  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2596  dst += (4 * dst_stride);
2597 
2598  LD_SB2(src, src_stride, src3, src4);
2599  src += (2 * src_stride);
2600  XORI_B2_128_SB(src3, src4);
2601  ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2602 
2603  dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2604  dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2605 
2606  LD_SB2(src, src_stride, src5, src6);
2607  src += (2 * src_stride);
2608  XORI_B2_128_SB(src5, src6);
2609  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2610 
2611  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2612  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2613 
2614  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2615  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2616  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2617  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2618  ST_W2(out0, 0, 2, dst, dst_stride);
2619  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2620  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2621  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2622 }
2623 
2624 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2625  uint8_t *dst, int32_t dst_stride,
2626  const int8_t *filter)
2627 {
2628  v16i8 src0, src1, src2, src3, src4;
2629  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2630  v16u8 out;
2631 
2632  src -= src_stride;
2633 
2634  /* rearranging filter_y */
2635  filt = LD_SH(filter);
2636  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2637 
2638  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2639  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2640  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2641  tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2642  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2643  tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2644  SRARI_H2_SH(tmp0, tmp1, 6);
2645  SAT_SH2_SH(tmp0, tmp1, 7);
2646  out = PCKEV_XORI128_UB(tmp0, tmp1);
2647  ST_D2(out, 0, 1, dst, dst_stride);
2648 }
2649 
2650 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2651  uint8_t *dst, int32_t dst_stride,
2652  const int8_t *filter)
2653 {
2654  uint32_t loop_cnt;
2655  uint64_t out0, out1, out2;
2656  v16i8 src0, src1, src2, src3, src4, src5;
2657  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2658  v8i16 filt, filt0, filt1;
2659 
2660  src -= src_stride;
2661 
2662  /* rearranging filter_y */
2663  filt = LD_SH(filter);
2664  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2665 
2666  LD_SB3(src, src_stride, src0, src1, src2);
2667  src += (3 * src_stride);
2668 
2669  XORI_B3_128_SB(src0, src1, src2);
2670  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2671 
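 /* 8x6: two iterations of three rows each */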
2672  for (loop_cnt = 2; loop_cnt--;) {
2673  LD_SB3(src, src_stride, src3, src4, src5);
2674  src += (3 * src_stride);
2675 
2676  XORI_B3_128_SB(src3, src4, src5);
2677  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2678  tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2679  tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2680  tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2681  SRARI_H2_SH(tmp0, tmp1, 6);
2682  tmp2 = __msa_srari_h(tmp2, 6);
2683  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2684  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2685  XORI_B2_128_SH(tmp0, tmp2);
2686 
2687  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2688  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2689  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2690  SD(out0, dst);
2691  dst += dst_stride;
2692  SD(out1, dst);
2693  dst += dst_stride;
2694  SD(out2, dst);
2695  dst += dst_stride;
2696 
2697  src2 = src5;
2698  vec0 = vec3;
2699  vec2 = vec4;
2700  }
2701 }
2702 
2703 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2704  uint8_t *dst, int32_t dst_stride,
2705  const int8_t *filter, int32_t height)
2706 {
2707  uint32_t loop_cnt;
2708  v16i8 src0, src1, src2, src7, src8, src9, src10;
2709  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2710  v16u8 tmp0, tmp1;
2711  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2712 
2713  src -= src_stride;
2714 
2715  filt = LD_SH(filter);
2716  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2717 
2718  LD_SB3(src, src_stride, src0, src1, src2);
2719  src += (3 * src_stride);
2720 
2721  XORI_B3_128_SB(src0, src1, src2);
2722  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2723 
2724  for (loop_cnt = (height >> 2); loop_cnt--;) {
2725  LD_SB4(src, src_stride, src7, src8, src9, src10);
2726  src += (4 * src_stride);
2727 
2728  XORI_B4_128_SB(src7, src8, src9, src10);
2729  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2730  src72_r, src87_r, src98_r, src109_r);
2731  out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2732  out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2733  out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2734  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2735  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2736  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2737  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2738  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2739  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2740  dst += (4 * dst_stride);
2741 
2742  src10_r = src98_r;
2743  src21_r = src109_r;
2744  src2 = src10;
2745  }
2746 }
2747 
2748 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2749  uint8_t *dst, int32_t dst_stride,
2750  const int8_t *filter, int32_t height)
2751 {
2752  if (2 == height) {
2753  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2754  } else if (6 == height) {
2755  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2756  } else {
2757  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2758  filter, height);
2759  }
2760 }
2761 
2762 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2763  uint8_t *dst, int32_t dst_stride,
2764  const int8_t *filter, int32_t height)
2765 {
2766  uint32_t loop_cnt;
2767  v16i8 src0, src1, src2, src3, src4, src5, src6;
2768  v16u8 out0, out1;
2769  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2770  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2771  v16i8 src2110, src4332, src6554;
2772  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2773  v8i16 filter_vec;
2774 
2775  src -= (1 * src_stride);
2776 
2777  filter_vec = LD_SH(filter);
2778  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2779 
2780  LD_SB3(src, src_stride, src0, src1, src2);
2781  src += (3 * src_stride);
2782 
2783  XORI_B3_128_SB(src0, src1, src2);
2784  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2785  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2786  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2787 
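 /* _r interleaves cover columns 0..7; the low halves of the _l
  * interleaves are packed so one vector holds columns 8..11 of two rows */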
2788  for (loop_cnt = 4; loop_cnt--;) {
2789  LD_SB4(src, src_stride, src3, src4, src5, src6);
2790  src += (4 * src_stride);
2791 
2792  XORI_B4_128_SB(src3, src4, src5, src6);
2793  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2794  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2795  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2796  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2797  ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2798  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2799 
2800  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2801  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2802  dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2803  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2804  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2805  dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2806 
2807  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2808  SRARI_H2_SH(dst0_l, dst1_l, 6);
2809  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2810  SAT_SH2_SH(dst0_l, dst1_l, 7);
2811  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2812  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2813  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2814  out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2815  ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2816  dst += (4 * dst_stride);
2817 
2818  src2 = src6;
2819  src10_r = src54_r;
2820  src21_r = src65_r;
2821  src2110 = src6554;
2822  }
2823 }
2824 
2825 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2826  uint8_t *dst, int32_t dst_stride,
2827  const int8_t *filter, int32_t height)
2828 {
2829  uint32_t loop_cnt;
2830  v16i8 src0, src1, src2, src3, src4, src5, src6;
2831  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2832  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2833  v16u8 tmp0, tmp1, tmp2, tmp3;
2834  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2835 
2836  src -= src_stride;
2837 
2838  filt = LD_SH(filter);
2839  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2840 
2841  LD_SB3(src, src_stride, src0, src1, src2);
2842  src += (3 * src_stride);
2843 
2844  XORI_B3_128_SB(src0, src1, src2);
2845  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2846  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2847 
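 /* right/left byte interleaves split the 16 columns into two 8-wide
  * halves for the vertical taps */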
2848  for (loop_cnt = (height >> 2); loop_cnt--;) {
2849  LD_SB4(src, src_stride, src3, src4, src5, src6);
2850  src += (4 * src_stride);
2851 
2852  XORI_B4_128_SB(src3, src4, src5, src6);
2853  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2854  src32_r, src43_r, src54_r, src65_r);
2855  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2856  src32_l, src43_l, src54_l, src65_l);
2857  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2858  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2859  out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2860  out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2861  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2862  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2863  out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2864  out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2865  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2866  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2867  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2868  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2869  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2870  out3_r, tmp0, tmp1, tmp2, tmp3);
2871  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2872  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2873  dst += (4 * dst_stride);
2874 
2875  src10_r = src54_r;
2876  src21_r = src65_r;
2877  src10_l = src54_l;
2878  src21_l = src65_l;
2879  src2 = src6;
2880  }
2881 }
2882 
2883 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2884  uint8_t *dst, int32_t dst_stride,
2885  const int8_t *filter, int32_t height)
2886 {
2887  uint32_t loop_cnt;
2888  uint64_t out0, out1;
2889  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2890  v16i8 src11, filt0, filt1;
2891  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2892  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2893  v16u8 out;
2894  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2895 
2896  src -= src_stride;
2897 
2898  filt = LD_SH(filter);
2899  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2900 
2901  /* 16 width */
2902  LD_SB3(src, src_stride, src0, src1, src2);
2903  XORI_B3_128_SB(src0, src1, src2);
2904  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2905  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2906 
2907  /* 8 width */
2908  LD_SB3(src + 16, src_stride, src6, src7, src8);
2909  src += (3 * src_stride);
2910  XORI_B3_128_SB(src6, src7, src8);
2911  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2912 
2913  for (loop_cnt = 8; loop_cnt--;) {
2914  /* 16 width */
2915  LD_SB2(src, src_stride, src3, src4);
2916  XORI_B2_128_SB(src3, src4);
2917  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2918  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2919 
2920  /* 8 width */
2921  LD_SB2(src + 16, src_stride, src9, src10);
2922  src += (2 * src_stride);
2923  XORI_B2_128_SB(src9, src10);
2924  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2925 
2926  /* 16 width */
2927  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2928  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2929  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2930  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2931 
2932  /* 8 width */
2933  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2934  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2935 
2936  /* 16 + 8 width */
2937  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2938  SRARI_H2_SH(out0_l, out1_l, 6);
2939  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2940  SAT_SH2_SH(out0_l, out1_l, 7);
2941  out = PCKEV_XORI128_UB(out0_r, out0_l);
2942  ST_UB(out, dst);
2943  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2944  XORI_B2_128_SH(out2_r, out3_r);
2945  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2946  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2947  SD(out0, dst + 16);
2948  dst += dst_stride;
2949  out = PCKEV_XORI128_UB(out1_r, out1_l);
2950  ST_UB(out, dst);
2951  SD(out1, dst + 16);
2952  dst += dst_stride;
2953 
2954  /* 16 width */
2955  LD_SB2(src, src_stride, src5, src2);
2956  XORI_B2_128_SB(src5, src2);
2957  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2958  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2959 
2960  /* 8 width */
2961  LD_SB2(src + 16, src_stride, src11, src8);
2962  src += (2 * src_stride);
2963  XORI_B2_128_SB(src11, src8);
2964  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2965 
2966  /* 16 width */
2967  out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2968  out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2969  out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2970  out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2971 
2972  /* 8 width */
2973  out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2974  out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2975 
2976  /* 16 + 8 width */
2977  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2978  SRARI_H2_SH(out0_l, out1_l, 6);
2979  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2980  SAT_SH2_SH(out0_l, out1_l, 7);
2981  out = PCKEV_XORI128_UB(out0_r, out0_l);
2982  ST_UB(out, dst);
2983  out = PCKEV_XORI128_UB(out2_r, out2_r);
2984  ST_D1(out, 0, dst + 16);
2985  dst += dst_stride;
2986  out = PCKEV_XORI128_UB(out1_r, out1_l);
2987  ST_UB(out, dst);
2988  out = PCKEV_XORI128_UB(out3_r, out3_r);
2989  ST_D1(out, 0, dst + 16);
2990  dst += dst_stride;
2991  }
2992 }
2993 
2994 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2995  uint8_t *dst, int32_t dst_stride,
2996  const int8_t *filter, int32_t height)
2997 {
2998  uint32_t loop_cnt;
2999  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3000  v16i8 src10_r, src32_r, src76_r, src98_r;
3001  v16i8 src21_r, src43_r, src87_r, src109_r;
3002  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3003  v16i8 src10_l, src32_l, src76_l, src98_l;
3004  v16i8 src21_l, src43_l, src87_l, src109_l;
3005  v8i16 filt;
3006  v16i8 filt0, filt1;
3007  v16u8 out;
3008 
3009  src -= src_stride;
3010 
3011  filt = LD_SH(filter);
3012  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3013 
3014  /* 16 width */
3015  LD_SB3(src, src_stride, src0, src1, src2);
3016  XORI_B3_128_SB(src0, src1, src2);
3017 
3018  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3019  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3020 
3021  /* next 16 width */
3022  LD_SB3(src + 16, src_stride, src6, src7, src8);
3023  src += (3 * src_stride);
3024 
3025  XORI_B3_128_SB(src6, src7, src8);
3026  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3027  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3028 
3029  for (loop_cnt = (height >> 1); loop_cnt--;) {
3030  /* 16 width */
3031  LD_SB2(src, src_stride, src3, src4);
3032  XORI_B2_128_SB(src3, src4);
3033  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3034  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3035 
3036  /* 16 width */
3037  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3038  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3039  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3040  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3041 
3042  /* 16 width */
3043  SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3044  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3045  out = PCKEV_XORI128_UB(out0_r, out0_l);
3046  ST_UB(out, dst);
3047  out = PCKEV_XORI128_UB(out1_r, out1_l);
3048  ST_UB(out, dst + dst_stride);
3049 
3050  src10_r = src32_r;
3051  src21_r = src43_r;
3052  src10_l = src32_l;
3053  src21_l = src43_l;
3054  src2 = src4;
3055 
3056  /* next 16 width */
3057  LD_SB2(src + 16, src_stride, src9, src10);
3058  src += (2 * src_stride);
3059  XORI_B2_128_SB(src9, src10);
3060  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3061  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3062 
3063  /* next 16 width */
3064  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3065  out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3066  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3067  out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3068 
3069  /* next 16 width */
3070  SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3071  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3072  out = PCKEV_XORI128_UB(out2_r, out2_l);
3073  ST_UB(out, dst + 16);
3074  out = PCKEV_XORI128_UB(out3_r, out3_l);
3075  ST_UB(out, dst + 16 + dst_stride);
3076 
3077  dst += 2 * dst_stride;
3078 
3079  src76_r = src98_r;
3080  src87_r = src109_r;
3081  src76_l = src98_l;
3082  src87_l = src109_l;
3083  src8 = src10;
3084  }
3085 }
3086 
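/* Horizontal + vertical (hv) 4-tap filter for a 4x2 block: five input rows,
 * a horizontal pass to 16-bit intermediates, a vertical pass over interleaved
 * intermediate pairs, then round, saturate and store two 4-byte rows. */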
3087 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
3088  int32_t src_stride,
3089  uint8_t *dst,
3090  int32_t dst_stride,
3091  const int8_t *filter_x,
3092  const int8_t *filter_y)
3093 {
3094  v16u8 out;
3095  v16i8 src0, src1, src2, src3, src4;
3096  v8i16 filt0, filt1;
3097  v8i16 filt_h0, filt_h1;
3098  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3099  v16i8 mask1;
3100  v8i16 filter_vec, tmp;
3101  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3102  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3103  v4i32 dst0, dst1;
3104 
3105  src -= (src_stride + 1);
3106 
3107  filter_vec = LD_SH(filter_x);
3108  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3109 
3110  filter_vec = LD_SH(filter_y);
3111  UNPCK_R_SB_SH(filter_vec, filter_vec);
3112 
3113  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3114 
3115  mask1 = mask0 + 2;
3116 
3117  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3118  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3119 
3120  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3121  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3122  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3123 
3124  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3125  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3126  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3127 
3128  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3129  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3130 
3131  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3132  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3133  dst0 >>= 6;
3134  dst1 >>= 6;
3135  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3136  tmp = __msa_srari_h(tmp, 6);
3137  tmp = __msa_sat_s_h(tmp, 7);
3138  out = PCKEV_XORI128_UB(tmp, tmp);
3139  ST_W2(out, 0, 1, dst, dst_stride);
3140 }
3141 
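/* hv 4-tap filter for a 4x4 block: seven input rows, four output rows. */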
3142 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3143  int32_t src_stride,
3144  uint8_t *dst,
3145  int32_t dst_stride,
3146  const int8_t *filter_x,
3147  const int8_t *filter_y)
3148 {
3149  v16u8 out;
3150  v16i8 src0, src1, src2, src3, src4, src5, src6;
3151  v8i16 filt0, filt1;
3152  v8i16 filt_h0, filt_h1;
3153  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3154  v16i8 mask1;
3155  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3156  v8i16 filter_vec, tmp0, tmp1;
3157  v8i16 dst30, dst41, dst52, dst63;
3158  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3159  v4i32 dst0, dst1, dst2, dst3;
3160 
3161  src -= (src_stride + 1);
3162 
3163  filter_vec = LD_SH(filter_x);
3164  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3165 
3166  filter_vec = LD_SH(filter_y);
3167  UNPCK_R_SB_SH(filter_vec, filter_vec);
3168 
3169  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3170 
3171  mask1 = mask0 + 2;
3172 
3173  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3174  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3175 
3176  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3177  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3178  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3179  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3180 
3181  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3182  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3183  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3184  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3185 
3186  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3187  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3188  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3189  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3190  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3191  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3192  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3193  SRA_4V(dst0, dst1, dst2, dst3, 6);
3194  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3195  SRARI_H2_SH(tmp0, tmp1, 6);
3196  SAT_SH2_SH(tmp0, tmp1, 7);
3197  out = PCKEV_XORI128_UB(tmp0, tmp1);
3198  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3199 }
3200 
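/* hv 4-tap filter, 4 columns wide, height a multiple of 8: three priming rows,
 * then eight rows per iteration; the last two interleaved tap pairs are
 * carried across iterations. */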
3201 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3202  int32_t src_stride,
3203  uint8_t *dst,
3204  int32_t dst_stride,
3205  const int8_t *filter_x,
3206  const int8_t *filter_y,
3207  int32_t height)
3208 {
3209  uint32_t loop_cnt;
3210  v16u8 out0, out1;
3211  v16i8 src0, src1, src2, src3, src4, src5;
3212  v16i8 src6, src7, src8, src9, src10;
3213  v8i16 filt0, filt1;
3214  v8i16 filt_h0, filt_h1;
3215  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3216  v16i8 mask1;
3217  v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3218  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3219  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3220  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3221  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3222  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3223  v8i16 dst98_r, dst109_r;
3224 
3225  src -= (src_stride + 1);
3226 
3227  filter_vec = LD_SH(filter_x);
3228  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3229 
3230  filter_vec = LD_SH(filter_y);
3231  UNPCK_R_SB_SH(filter_vec, filter_vec);
3232 
3233  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3234 
3235  mask1 = mask0 + 2;
3236 
3237  LD_SB3(src, src_stride, src0, src1, src2);
3238  src += (3 * src_stride);
3239 
3240  XORI_B3_128_SB(src0, src1, src2);
3241 
3242  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3243  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3244  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3245  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3246  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3247  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3248 
3249  for (loop_cnt = height >> 3; loop_cnt--;) {
3250  LD_SB8(src, src_stride,
3251  src3, src4, src5, src6, src7, src8, src9, src10);
3252  src += (8 * src_stride);
3253 
3254  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3255 
3256  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3257  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3258  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3259  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3260 
3261  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3262  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3263  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3264  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3265 
3266  dst32_r = __msa_ilvr_h(dst73, dst22);
3267  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3268  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3269  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3270  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3271  dst76_r = __msa_ilvr_h(dst22, dst106);
3272 
3273  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3274  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3275  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3276  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3277  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3278  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3279  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3280  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3281  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3282  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3283  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3284  dst5_r, dst4_r, dst7_r, dst6_r,
3285  tmp0, tmp1, tmp2, tmp3);
3286  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3287  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3288  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3289  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3290  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3291  dst += (8 * dst_stride);
3292 
3293  dst10_r = dst98_r;
3294  dst21_r = dst109_r;
3295  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3296  }
3297 }
3298 
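/* Width 4: dispatch to the fixed-height kernels for heights 2 and 4,
 * otherwise to the multiple-of-8 kernel. */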
3299 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3300  int32_t src_stride,
3301  uint8_t *dst,
3302  int32_t dst_stride,
3303  const int8_t *filter_x,
3304  const int8_t *filter_y,
3305  int32_t height)
3306 {
3307  if (2 == height) {
3308  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3309  filter_x, filter_y);
3310  } else if (4 == height) {
3311  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3312  filter_x, filter_y);
3313  } else if (0 == (height % 8)) {
3314  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3315  filter_x, filter_y, height);
3316  }
3317 }
3318 
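/* hv 4-tap filter for a 6x8 block: results are computed 8 columns wide; the
 * left four columns are stored as words (ST_W8) and the remaining two columns
 * as halfwords (ST_H8) at dst + 4. */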
3319 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3320  int32_t src_stride,
3321  uint8_t *dst,
3322  int32_t dst_stride,
3323  const int8_t *filter_x,
3324  const int8_t *filter_y,
3325  int32_t height)
3326 {
3327  v16u8 out0, out1, out2;
3328  v16i8 src0, src1, src2, src3, src4, src5, src6;
3329  v16i8 src7, src8, src9, src10;
3330  v8i16 filt0, filt1;
3331  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3332  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3333  v16i8 mask1;
3334  v8i16 filt_h0, filt_h1, filter_vec;
3335  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3336  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3337  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3338  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3339  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3340  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3341  v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3342  v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3343  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3344 
3345  src -= (src_stride + 1);
3346 
3347  filter_vec = LD_SH(filter_x);
3348  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3349 
3350  filter_vec = LD_SH(filter_y);
3351  UNPCK_R_SB_SH(filter_vec, filter_vec);
3352 
3353  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3354 
3355  mask1 = mask0 + 2;
3356 
3357  LD_SB3(src, src_stride, src0, src1, src2);
3358  src += (3 * src_stride);
3359 
3360  XORI_B3_128_SB(src0, src1, src2);
3361 
3362  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3363  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3364  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3365 
3366  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3367  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3368  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3369 
3370  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3371  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3372 
3373  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3374  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3375 
3376  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3377  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3378  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3379  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3380 
3381  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3382  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3383  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3384  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3385 
3386  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3387  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3388  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3389  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3390 
3391  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3392  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3393  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3394  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3395 
3396  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3397  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3398  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3399  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3400  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3401  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3402  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3403  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3404 
3405  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3406  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3407  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3408 
3409  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3410  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3411  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3412  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3413  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3414  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3415  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3416  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3417  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3418  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3419  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3420  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3421  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3422  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3423  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3424  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3425  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3426  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3427  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3428  SRARI_H2_SH(tmp4, tmp5, 6);
3429  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3430  SAT_SH2_SH(tmp4, tmp5, 7);
3431  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3432  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3433  out2 = PCKEV_XORI128_UB(tmp4, tmp5);
3434  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3435  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
3436 }
3437 
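/* hv 4-tap filter for an 8x2 block: five input rows, two output rows. */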
3438 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3439  int32_t src_stride,
3440  uint8_t *dst,
3441  int32_t dst_stride,
3442  const int8_t *filter_x,
3443  const int8_t *filter_y)
3444 {
3445  v16u8 out;
3446  v16i8 src0, src1, src2, src3, src4;
3447  v8i16 filt0, filt1;
3448  v8i16 filt_h0, filt_h1, filter_vec;
3449  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3450  v16i8 mask1;
3451  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3452  v8i16 dst0, dst1, dst2, dst3, dst4;
3453  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3454  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3455  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3456  v8i16 out0_r, out1_r;
3457 
3458  src -= (src_stride + 1);
3459 
3460  filter_vec = LD_SH(filter_x);
3461  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3462 
3463  filter_vec = LD_SH(filter_y);
3464  UNPCK_R_SB_SH(filter_vec, filter_vec);
3465 
3466  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3467 
3468  mask1 = mask0 + 2;
3469 
3470  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3471  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3472 
3473  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3474  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3475  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3476  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3477  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3478 
3479  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3480  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3481  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3482  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3483  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3484  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3485  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3486  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3487  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3488  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3489  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3490  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3491  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3492  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3493  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3494  SRARI_H2_SH(out0_r, out1_r, 6);
3495  SAT_SH2_SH(out0_r, out1_r, 7);
3496  out = PCKEV_XORI128_UB(out0_r, out1_r);
3497  ST_D2(out, 0, 1, dst, dst_stride);
3498 }
3499 
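/* hv 4-tap filter, four rows tall, width a multiple of 8: seven input rows
 * are consumed per 8-wide column. */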
3500 static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
3501  int32_t src_stride,
3502  uint8_t *dst,
3503  int32_t dst_stride,
3504  const int8_t *filter_x,
3505  const int8_t *filter_y,
3506  int32_t width8mult)
3507 {
3508  uint32_t cnt;
3509  v16u8 out0, out1;
3510  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3511  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3512  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3513  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3514  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3515  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3516  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3517 
3518  src -= (src_stride + 1);
3519 
3520  filter_vec = LD_SH(filter_x);
3521  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3522 
3523  filter_vec = LD_SH(filter_y);
3524  UNPCK_R_SB_SH(filter_vec, filter_vec);
3525 
3526  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3527 
3528  mask0 = LD_SB(ff_hevc_mask_arr);
3529  mask1 = mask0 + 2;
3530 
3531  for (cnt = width8mult; cnt--;) {
3532  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3533  src += 8;
3534  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3535 
3536  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3537  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3538  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3539 
3540  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3541  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3542  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3543 
3544  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3545  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3546 
3547  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3548  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3549  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3550  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3551 
3552  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3553  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3554  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3555  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3556 
3557  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3558  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3559  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3560  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3561 
3562  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3563  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3564  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3565  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3566  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3567  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3568  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3569  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3570 
3571  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3572  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3573 
3574  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3575  dst3_r, tmp0, tmp1, tmp2, tmp3);
3576  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3577  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3578  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3579  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3580  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3581  dst += 8;
3582  }
3583 }
3584 
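/* hv 4-tap filter for an 8x6 block: nine input rows, six output rows. */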
3585 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3586  int32_t src_stride,
3587  uint8_t *dst,
3588  int32_t dst_stride,
3589  const int8_t *filter_x,
3590  const int8_t *filter_y)
3591 {
3592  v16u8 out0, out1, out2;
3593  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3594  v8i16 filt0, filt1;
3595  v8i16 filt_h0, filt_h1, filter_vec;
3596  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3597  v16i8 mask1;
3598  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3599  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3600  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3601  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3602  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3603  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3604  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3605  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3606  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3607  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3608 
3609  src -= (src_stride + 1);
3610 
3611  filter_vec = LD_SH(filter_x);
3612  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3613 
3614  filter_vec = LD_SH(filter_y);
3615  UNPCK_R_SB_SH(filter_vec, filter_vec);
3616 
3617  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3618 
3619  mask1 = mask0 + 2;
3620 
3621  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3622  src += (5 * src_stride);
3623  LD_SB4(src, src_stride, src5, src6, src7, src8);
3624 
3625  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3626  XORI_B4_128_SB(src5, src6, src7, src8);
3627 
3628  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3629  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3630  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3631  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3632  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3633  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3634  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3635  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3636  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3637 
3638  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3639  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3640  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3641  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3642  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3643  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3644  dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3645  dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3646  dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3647 
3648  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3649  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3650  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3651  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3652  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3653  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3654  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3655  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3656 
3657  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3658  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3659  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3660  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3661  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3662  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3663  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3664  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3665  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3666  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3667  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3668  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3669 
3670  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3671  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3672  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3673  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3674  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3675  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3676  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3677  SRARI_H2_SH(out4_r, out5_r, 6);
3678  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3679  SAT_SH2_SH(out4_r, out5_r, 7);
3680  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3681  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3682  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3683 
3684  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3685  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3686 }
3687 
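/* Generic hv 4-tap filter for widths that are multiples of 8: outer loop over
 * 8-wide columns, inner loop producing four rows per iteration with the last
 * two interleaved tap pairs carried across iterations. */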
3688 static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
3689  int32_t src_stride,
3690  uint8_t *dst,
3691  int32_t dst_stride,
3692  const int8_t *filter_x,
3693  const int8_t *filter_y,
3694  int32_t height,
3695  int32_t width8mult)
3696 {
3697  uint32_t loop_cnt, cnt;
3698  uint8_t *src_tmp;
3699  uint8_t *dst_tmp;
3700  v16u8 out0, out1;
3701  v16i8 src0, src1, src2, src3, src4, src5, src6;
3702  v8i16 filt0, filt1;
3703  v8i16 filt_h0, filt_h1, filter_vec;
3704  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3705  v16i8 mask1;
3706  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3707  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3708  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3709  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3710  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3711  v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3712  v8i16 out0_r, out1_r, out2_r, out3_r;
3713 
3714  src -= (src_stride + 1);
3715 
3716  filter_vec = LD_SH(filter_x);
3717  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3718 
3719  filter_vec = LD_SH(filter_y);
3720  UNPCK_R_SB_SH(filter_vec, filter_vec);
3721 
3722  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3723 
3724  mask1 = mask0 + 2;
3725 
3726  for (cnt = width8mult; cnt--;) {
3727  src_tmp = src;
3728  dst_tmp = dst;
3729 
3730  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3731  src_tmp += (3 * src_stride);
3732 
3733  XORI_B3_128_SB(src0, src1, src2);
3734 
3735  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3736  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3737  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3738 
3739  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3740  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3741  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3742 
3743  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3744  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3745 
3746  for (loop_cnt = (height >> 2); loop_cnt--;) {
3747  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3748  src_tmp += (4 * src_stride);
3749 
3750  XORI_B4_128_SB(src3, src4, src5, src6);
3751 
3752  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3753  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3754  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3755  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3756 
3757  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3758  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3759  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3760  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3761 
3762  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3763  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3764  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3765  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3766 
3767  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3768  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3769  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3770  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3771  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3772  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3773  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3774  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3775 
3776  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3777  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3778 
3779  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3780  dst2_l, dst2_r, dst3_l, dst3_r,
3781  out0_r, out1_r, out2_r, out3_r);
3782 
3783  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3784  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3785  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3786  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3787  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3788  dst_tmp += (4 * dst_stride);
3789 
3790  dst10_r = dst54_r;
3791  dst10_l = dst54_l;
3792  dst21_r = dst65_r;
3793  dst21_l = dst65_l;
3794  dst2 = dst6;
3795  }
3796 
3797  src += 8;
3798  dst += 8;
3799  }
3800 }
3801 
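/* Width 8: dispatch to the fixed-height kernels for heights 2, 4 and 6,
 * otherwise to the generic multiple-of-4 kernel. */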
3802 static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3803  int32_t src_stride,
3804  uint8_t *dst,
3805  int32_t dst_stride,
3806  const int8_t *filter_x,
3807  const int8_t *filter_y,
3808  int32_t height)
3809 {
3810  if (2 == height) {
3811  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3812  filter_x, filter_y);
3813  } else if (4 == height) {
3814  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3815  filter_x, filter_y, 1);
3816  } else if (6 == height) {
3817  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3818  filter_x, filter_y);
3819  } else if (0 == (height % 4)) {
3820  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3821  filter_x, filter_y, height, 1);
3822  }
3823 }
3824 
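/* Width 12: the left 8 columns go through the 8-wide scheme (four passes of
 * four rows), the right 4 columns through the 4-wide scheme (two passes of
 * eight rows), covering 16 rows in total. */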
3825 static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3826  int32_t src_stride,
3827  uint8_t *dst,
3828  int32_t dst_stride,
3829  const int8_t *filter_x,
3830  const int8_t *filter_y,
3831  int32_t height)
3832 {
3833  uint32_t loop_cnt;
3834  uint8_t *src_tmp, *dst_tmp;
3835  v16u8 out0, out1;
3836  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3837  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3838  v16i8 mask0, mask1, mask2, mask3;
3839  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3840  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3841  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3842  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3843  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3844  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3845  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3846  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3847 
3848  src -= (src_stride + 1);
3849 
3850  filter_vec = LD_SH(filter_x);
3851  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3852 
3853  filter_vec = LD_SH(filter_y);
3854  UNPCK_R_SB_SH(filter_vec, filter_vec);
3855 
3856  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3857 
3858  mask0 = LD_SB(ff_hevc_mask_arr);
3859  mask1 = mask0 + 2;
3860 
3861  src_tmp = src;
3862  dst_tmp = dst;
3863 
3864  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3865  src_tmp += (3 * src_stride);
3866 
3867  XORI_B3_128_SB(src0, src1, src2);
3868 
3869  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3870  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3871  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3872 
3873  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3874  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3875  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3876 
3877  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3878  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3879 
3880  for (loop_cnt = 4; loop_cnt--;) {
3881  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3882  src_tmp += (4 * src_stride);
3883  XORI_B4_128_SB(src3, src4, src5, src6);
3884 
3885  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3886  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3887  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3888  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3889 
3890  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3891  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3892  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3893  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3894 
3895  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3896  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3897  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3898  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3899 
3900  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3901  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3902  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3903  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3904  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3905  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3906  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3907  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3908 
3909  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3910  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3911 
3912  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3913  dst3_r, tmp0, tmp1, tmp2, tmp3);
3914  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3915  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3916  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3917  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3918  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3919  dst_tmp += (4 * dst_stride);
3920 
3921  dst10_r = dst54_r;
3922  dst10_l = dst54_l;
3923  dst21_r = dst65_r;
3924  dst21_l = dst65_l;
3925  dsth2 = dsth6;
3926  }
3927 
3928  src += 8;
3929  dst += 8;
3930 
3931  mask2 = LD_SB(ff_hevc_mask_arr + 16);
3932  mask3 = mask2 + 2;
3933 
3934  LD_SB3(src, src_stride, src0, src1, src2);
3935  src += (3 * src_stride);
3936  XORI_B3_128_SB(src0, src1, src2);
3937  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3938  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
3939 
3940  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3941  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3942 
3943  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3944  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3945 
3946  for (loop_cnt = 2; loop_cnt--;) {
3947  LD_SB8(src, src_stride,
3948  src3, src4, src5, src6, src7, src8, src9, src10);
3949  src += (8 * src_stride);
3950  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3951  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
3952  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
3953  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
3954  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
3955 
3956  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3957  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3958  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3959  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3960 
3961  dst32_r = __msa_ilvr_h(dst73, dst22);
3962  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3963  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3964  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3965  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3966  dst76_r = __msa_ilvr_h(dst22, dst106);
3967 
3968  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3969  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3970  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3971  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3972  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3973  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3974  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3975  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3976  SRA_4V(dst0, dst1, dst2, dst3, 6);
3977  SRA_4V(dst4, dst5, dst6, dst7, 6);
3978  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3979  tmp0, tmp1, tmp2, tmp3);
3980  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3981  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3982  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3983  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3984  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3985  dst += (8 * dst_stride);
3986 
3987  dst10_r = dst98_r;
3988  dst21_r = dst109_r;
3989  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3990  }
3991 }
3992 
3993 static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
3994  int32_t src_stride,
3995  uint8_t *dst,
3996  int32_t dst_stride,
3997  const int8_t *filter_x,
3998  const int8_t *filter_y,
3999  int32_t height)
4000 {
4001  if (4 == height) {
4002  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
4003  filter_y, 2);
4004  } else {
4005  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4006  filter_x, filter_y, height, 2);
4007  }
4008 }
4009 
4010 static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
4011  int32_t src_stride,
4012  uint8_t *dst,
4013  int32_t dst_stride,
4014  const int8_t *filter_x,
4015  const int8_t *filter_y,
4016  int32_t height)
4017 {
4018  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4019  filter_x, filter_y, height, 3);
4020 }
4021 
4022 static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
4023  int32_t src_stride,
4024  uint8_t *dst,
4025  int32_t dst_stride,
4026  const int8_t *filter_x,
4027  const int8_t *filter_y,
4028  int32_t height)
4029 {
4030  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4031  filter_x, filter_y, height, 4);
4032 }
4033 
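/* Instantiate the ff_hevc_put_hevc_uni_pel_pixels<WIDTH>_8_msa entry points,
 * which forward to the plain copy kernels. */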
4034 #define UNI_MC_COPY(WIDTH) \
4035 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4036  ptrdiff_t dst_stride, \
4037  uint8_t *src, \
4038  ptrdiff_t src_stride, \
4039  int height, \
4040  intptr_t mx, \
4041  intptr_t my, \
4042  int width) \
4043 { \
4044  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
4045 }
4046 
4047 UNI_MC_COPY(8);
4048 UNI_MC_COPY(12);
4049 UNI_MC_COPY(16);
4050 UNI_MC_COPY(24);
4051 UNI_MC_COPY(32);
4052 UNI_MC_COPY(48);
4053 UNI_MC_COPY(64);
4054 
4055 #undef UNI_MC_COPY
4056 
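/* Instantiate the uni directional entry points: DIR1 (hz/vt) names the kernel
 * and FILT_DIR (mx or my) selects the filter phase from the qpel/epel tables. */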
4057 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4058 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4059  ptrdiff_t dst_stride, \
4060  uint8_t *src, \
4061  ptrdiff_t src_stride, \
4062  int height, \
4063  intptr_t mx, \
4064  intptr_t my, \
4065  int width) \
4066 { \
4067  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4068  \
4069  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4070  filter, height); \
4071 }
4072 
4073 UNI_MC(qpel, h, 4, 8, hz, mx);
4074 UNI_MC(qpel, h, 8, 8, hz, mx);
4075 UNI_MC(qpel, h, 12, 8, hz, mx);
4076 UNI_MC(qpel, h, 16, 8, hz, mx);
4077 UNI_MC(qpel, h, 24, 8, hz, mx);
4078 UNI_MC(qpel, h, 32, 8, hz, mx);
4079 UNI_MC(qpel, h, 48, 8, hz, mx);
4080 UNI_MC(qpel, h, 64, 8, hz, mx);
4081 
4082 UNI_MC(qpel, v, 4, 8, vt, my);
4083 UNI_MC(qpel, v, 8, 8, vt, my);
4084 UNI_MC(qpel, v, 12, 8, vt, my);
4085 UNI_MC(qpel, v, 16, 8, vt, my);
4086 UNI_MC(qpel, v, 24, 8, vt, my);
4087 UNI_MC(qpel, v, 32, 8, vt, my);
4088 UNI_MC(qpel, v, 48, 8, vt, my);
4089 UNI_MC(qpel, v, 64, 8, vt, my);
4090 
4091 UNI_MC(epel, h, 4, 4, hz, mx);
4092 UNI_MC(epel, h, 6, 4, hz, mx);
4093 UNI_MC(epel, h, 8, 4, hz, mx);
4094 UNI_MC(epel, h, 12, 4, hz, mx);
4095 UNI_MC(epel, h, 16, 4, hz, mx);
4096 UNI_MC(epel, h, 24, 4, hz, mx);
4097 UNI_MC(epel, h, 32, 4, hz, mx);
4098 
4099 UNI_MC(epel, v, 4, 4, vt, my);
4100 UNI_MC(epel, v, 6, 4, vt, my);
4101 UNI_MC(epel, v, 8, 4, vt, my);
4102 UNI_MC(epel, v, 12, 4, vt, my);
4103 UNI_MC(epel, v, 16, 4, vt, my);
4104 UNI_MC(epel, v, 24, 4, vt, my);
4105 UNI_MC(epel, v, 32, 4, vt, my);
4106 
4107 #undef UNI_MC
4108 
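/* Instantiate the uni hv entry points: mx and my select the horizontal and
 * vertical filter phases respectively. */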
4109 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4110 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4111  ptrdiff_t dst_stride, \
4112  uint8_t *src, \
4113  ptrdiff_t src_stride, \
4114  int height, \
4115  intptr_t mx, \
4116  intptr_t my, \
4117  int width) \
4118 { \
4119  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4120  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4121  \
4122  hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4123  filter_x, filter_y, height); \
4124 }
4125 
4126 UNI_MC_HV(qpel, 4, 8);
4127 UNI_MC_HV(qpel, 8, 8);
4128 UNI_MC_HV(qpel, 12, 8);
4129 UNI_MC_HV(qpel, 16, 8);
4130 UNI_MC_HV(qpel, 24, 8);
4131 UNI_MC_HV(qpel, 32, 8);
4132 UNI_MC_HV(qpel, 48, 8);
4133 UNI_MC_HV(qpel, 64, 8);
4134 
4135 UNI_MC_HV(epel, 4, 4);
4136 UNI_MC_HV(epel, 6, 4);
4137 UNI_MC_HV(epel, 8, 4);
4138 UNI_MC_HV(epel, 12, 4);
4139 UNI_MC_HV(epel, 16, 4);
4140 UNI_MC_HV(epel, 24, 4);
4141 UNI_MC_HV(epel, 32, 4);
4142 
4143 #undef UNI_MC_HV