diff --git a/LICENSE b/LICENSE index 261eeb9..d6aac6c 100644 --- a/LICENSE +++ b/LICENSE @@ -1,201 +1,53 @@ - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. 
Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. 
Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. 
- You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. +Apache License + +Version 2.0, January 2004 + +http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + +"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. + +"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. + +"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. + +"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. + +"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. + +"Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. + +"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). + +"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. + +"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
+ +"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: + +You must give any other recipients of the Work or Derivative Works a copy of this License; and +You must cause any modified files to carry prominent notices stating that You changed the files; and +You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and +If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. + +You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. +5. Submission of Contributions. 
Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS \ No newline at end of file diff --git a/README.md b/README.md index 6092beb..02c4c4a 100644 --- a/README.md +++ b/README.md @@ -1 +1,64 @@ -# AvxToNeon \ No newline at end of file +# AVX TO NEON + +- [Overview](#overview) +- [License](#license) +- [Requirements](#requirements) +- [Guidelines](#guidelines) +- [Test](#test) +- [More Information](#more-information) +- [Copyright](#copyright) + +## Overview + +When applications that use Intel intrinsics are ported from the x86 architecture to the Kunpeng architecture, the intrinsics must be reimplemented, because Arm64 instruction names and functions differ from those of x86; this creates a large porting workload. In this project, the most frequently used AVX instructions are encapsulated as independent modules to reduce that repeated development effort. Each AVX instruction is mapped to equivalent NEON SIMD instructions while its name and function remain unchanged, so an application can invoke the instructions simply by including the related header files, as the sketch below shows.
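As a minimal usage sketch (the file name example.c and the data values are illustrative assumptions; raw NEON loads/stores are used because no load intrinsic appears in this patch, while `_mm512_add_epi32` and the `__m512i` layout are taken from the patch itself):

```c
/* example.c: hypothetical usage sketch, not a file in this repository */
#include <stdio.h>
#include <arm_neon.h>
#include "avx2neon.h"   /* stands in for <immintrin.h> on x86 */

int main(void)
{
    int32_t a[16], b[16], r[16];
    for (int i = 0; i < 16; ++i) {
        a[i] = i;
        b[i] = 100 * i;
    }

    /* __m512i is a union of four 128-bit NEON registers, so plain
       NEON loads/stores can move data in and out of it. */
    __m512i va, vb;
    for (int i = 0; i < 4; ++i) {
        va.vect_s32[i] = vld1q_s32(a + 4 * i);
        vb.vect_s32[i] = vld1q_s32(b + 4 * i);
    }

    __m512i vr = _mm512_add_epi32(va, vb);  /* four vaddq_s32 under the hood */

    for (int i = 0; i < 4; ++i) {
        vst1q_s32(r + 4 * i, vr.vect_s32[i]);
    }
    printf("r[15] = %d\n", r[15]);  /* 15 + 1500 = 1515 */
    return 0;
}
```

Built on a Kunpeng machine with, for example, `gcc -O2 -march=armv8-a+fp+simd+crc example.c -o example`, matching the compilation options recommended in the Guidelines section.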
## License + +This project is licensed under the [Apache License, Version 2.0](https://www.apache.org/licenses/LICENSE-2.0). For more information, see the LICENSE file. + +## Requirements + +- CPU: Kunpeng 920 + +## Guidelines + +In the source code tree, the source directory contains the function implementation files: avx512intrin.h, avxintrin.h, and emmintrin.h implement the instruction translation, and avx2neon.h pulls all of them in, so an application only needs to include avx2neon.h. +When using the header files, add the matching compilation options, for example, ARCH_CFLAGS = -march=armv8-a+fp+simd+crc. + +## Test + +This project also provides interface test cases for developers. The test-case logic lives in the tests directory, and the input data and expected output of each case are in the data directory. Run the test cases with the following commands: + +``` +(1) cd tests +(2) make +(3) ./test +``` + +After the **test** command is executed, information similar to the following is displayed on the console: + +``` +Running Test MM512_CASTPS128_PS512 + +... + +Running Test MM256_SET_EPI32 + +AVX2NEONTest Complete: Passed 265 tests: Failed 0 +``` + +All the instructions provided in this project have been verified on CentOS Linux release 7.6.1810 (AltArch) and EulerOS V2.0SP8, with GCC 4.8.5, GCC 7.3, and GCC 9.2.0. + +## More Information + +For more information, visit + + + +If you have questions or comments, we encourage you to create an issue on GitHub. If you wish to contact the Huawei team directly, send an email to [kunpengcompute@huawei.com](mailto:kunpengcompute@huawei.com). + +## Copyright + +Copyright © 2020 Huawei Corporation. All rights reserved. \ No newline at end of file diff --git a/avx2neon.h b/avx2neon.h new file mode 100644 index 0000000..98b5ecc --- /dev/null +++ b/avx2neon.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + + */ + +#ifndef AVX2NEON_H +#define AVX2NEON_H + +#include <stdint.h> +#include <stdlib.h> +#include <string.h> +#include <assert.h> + +#include "emmintrin.h" +#include "avxintrin.h" +#include "avx512intrin.h" + +#endif \ No newline at end of file diff --git a/avx512intrin.h b/avx512intrin.h new file mode 100644 index 0000000..4029098 --- /dev/null +++ b/avx512intrin.h @@ -0,0 +1,1834 @@ +/* + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + + */ + +#ifndef AVX2NEON_H +#error Never use <avx512intrin.h> directly; include "avx2neon.h" instead. +#endif + + +#include <arm_neon.h> + +#include <math.h> +#ifdef __cplusplus +using namespace std; +#endif + +#include "typedefs.h" + +typedef union { + int8x16_t vect_s8[4]; + int16x8_t vect_s16[4]; + int32x4_t vect_s32[4]; + int64x2_t vect_s64[4]; + uint8x16_t vect_u8[4]; + uint16x8_t vect_u16[4]; + uint32x4_t vect_u32[4]; + uint64x2_t vect_u64[4]; + __m256i vect_i256[2]; + __m128i vect_i128[4]; +} __m512i __attribute__((aligned(64))); + +typedef struct { + float32x4_t vect_f32[4]; +} __m512; + +typedef struct { + float64x2_t vect_f64[4]; +} __m512d; + +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +FORCE_INLINE __m512i _mm512_div_epi8(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_i128[0] = _mm_div_epi8(a.vect_i128[0], b.vect_i128[0]); + res_m512i.vect_i128[1] = _mm_div_epi8(a.vect_i128[1], b.vect_i128[1]); + res_m512i.vect_i128[2] = _mm_div_epi8(a.vect_i128[2], b.vect_i128[2]); + res_m512i.vect_i128[3] = _mm_div_epi8(a.vect_i128[3], b.vect_i128[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_div_epi16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_i128[0] = _mm_div_epi16(a.vect_i128[0], b.vect_i128[0]); + res_m512i.vect_i128[1] = _mm_div_epi16(a.vect_i128[1], b.vect_i128[1]); + res_m512i.vect_i128[2] = _mm_div_epi16(a.vect_i128[2], b.vect_i128[2]); + res_m512i.vect_i128[3] = _mm_div_epi16(a.vect_i128[3], b.vect_i128[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_div_epi32(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_i256[0] = _mm256_div_epi32(a.vect_i256[0], b.vect_i256[0]); + res_m512i.vect_i256[1] = _mm256_div_epi32(a.vect_i256[1], b.vect_i256[1]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_div_epi64(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_i256[0] = _mm256_div_epi64(a.vect_i256[0], b.vect_i256[0]); + res_m512i.vect_i256[1] = _mm256_div_epi64(a.vect_i256[1], b.vect_i256[1]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_div_epu8(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_i128[0] = _mm_div_epu8(a.vect_i128[0], b.vect_i128[0]); + res_m512i.vect_i128[1] = _mm_div_epu8(a.vect_i128[1], b.vect_i128[1]); + res_m512i.vect_i128[2] = _mm_div_epu8(a.vect_i128[2], b.vect_i128[2]); + res_m512i.vect_i128[3] = _mm_div_epu8(a.vect_i128[3], b.vect_i128[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_div_epu16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_i128[0] = _mm_div_epu16(a.vect_i128[0], b.vect_i128[0]); + res_m512i.vect_i128[1] = _mm_div_epu16(a.vect_i128[1], b.vect_i128[1]); + res_m512i.vect_i128[2] = _mm_div_epu16(a.vect_i128[2], b.vect_i128[2]); + res_m512i.vect_i128[3] = _mm_div_epu16(a.vect_i128[3], b.vect_i128[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_div_epu32(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_i256[0]
= _mm256_div_epu32(a.vect_i256[0], b.vect_i256[0]); + res_m512i.vect_i256[1] = _mm256_div_epu32(a.vect_i256[1], b.vect_i256[1]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_div_epu64(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_i256[0] = _mm256_div_epu64(a.vect_i256[0], b.vect_i256[0]); + res_m512i.vect_i256[1] = _mm256_div_epu64(a.vect_i256[1], b.vect_i256[1]); + return res_m512i; +} + +FORCE_INLINE __m512 _mm512_div_ps(__m512 a, __m512 b) +{ + __m512 res_m512; + res_m512.vect_f32[0] = vdivq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m512.vect_f32[1] = vdivq_f32(a.vect_f32[1], b.vect_f32[1]); + res_m512.vect_f32[2] = vdivq_f32(a.vect_f32[2], b.vect_f32[2]); + res_m512.vect_f32[3] = vdivq_f32(a.vect_f32[3], b.vect_f32[3]); + return res_m512; +} + +FORCE_INLINE __m512d _mm512_div_pd(__m512d a, __m512d b) +{ + __m512d res_m512d; + res_m512d.vect_f64[0] = vdivq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m512d.vect_f64[1] = vdivq_f64(a.vect_f64[1], b.vect_f64[1]); + res_m512d.vect_f64[2] = vdivq_f64(a.vect_f64[2], b.vect_f64[2]); + res_m512d.vect_f64[3] = vdivq_f64(a.vect_f64[3], b.vect_f64[3]); + return res_m512d; +} + +FORCE_INLINE __m512 _mm512_div_round_ps(__m512 a, __m512 b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == _MM_FROUND_CUR_DIRECTION)); + (void)rounding; + __m512 res_m512; + res_m512.vect_f32[0] = vdivq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m512.vect_f32[1] = vdivq_f32(a.vect_f32[1], b.vect_f32[1]); + res_m512.vect_f32[2] = vdivq_f32(a.vect_f32[2], b.vect_f32[2]); + res_m512.vect_f32[3] = vdivq_f32(a.vect_f32[3], b.vect_f32[3]); + return res_m512; +} + +FORCE_INLINE __m512d _mm512_div_round_pd(__m512d a, __m512d b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == _MM_FROUND_CUR_DIRECTION)); + (void)rounding; + __m512d res_m512d; + res_m512d.vect_f64[0] = vdivq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m512d.vect_f64[1] = vdivq_f64(a.vect_f64[1], b.vect_f64[1]); + res_m512d.vect_f64[2] = vdivq_f64(a.vect_f64[2], b.vect_f64[2]); + res_m512d.vect_f64[3] = vdivq_f64(a.vect_f64[3], b.vect_f64[3]); + return res_m512d; +} + + +FORCE_INLINE __m512i _mm512_add_epi8(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s8[0] = vaddq_s8(a.vect_s8[0], b.vect_s8[0]); + res_m512i.vect_s8[1] = vaddq_s8(a.vect_s8[1], b.vect_s8[1]); + res_m512i.vect_s8[2] = vaddq_s8(a.vect_s8[2], b.vect_s8[2]); + res_m512i.vect_s8[3] = vaddq_s8(a.vect_s8[3], b.vect_s8[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_add_epi16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s16[0] = vaddq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m512i.vect_s16[1] = vaddq_s16(a.vect_s16[1], b.vect_s16[1]); + res_m512i.vect_s16[2] = vaddq_s16(a.vect_s16[2], b.vect_s16[2]); + res_m512i.vect_s16[3] = vaddq_s16(a.vect_s16[3], b.vect_s16[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_add_epi32(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = vaddq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m512i.vect_s32[1] = vaddq_s32(a.vect_s32[1], b.vect_s32[1]); + 
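/* the 512-bit vector is modeled as four 128-bit NEON registers; lanes 2 and 3 are added below in the same way */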
res_m512i.vect_s32[2] = vaddq_s32(a.vect_s32[2], b.vect_s32[2]); + res_m512i.vect_s32[3] = vaddq_s32(a.vect_s32[3], b.vect_s32[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_add_epi64(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s64[0] = vaddq_s64(a.vect_s64[0], b.vect_s64[0]); + res_m512i.vect_s64[1] = vaddq_s64(a.vect_s64[1], b.vect_s64[1]); + res_m512i.vect_s64[2] = vaddq_s64(a.vect_s64[2], b.vect_s64[2]); + res_m512i.vect_s64[3] = vaddq_s64(a.vect_s64[3], b.vect_s64[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_adds_epi8(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s8[0] = vqaddq_s8(a.vect_s8[0], b.vect_s8[0]); + res_m512i.vect_s8[1] = vqaddq_s8(a.vect_s8[1], b.vect_s8[1]); + res_m512i.vect_s8[2] = vqaddq_s8(a.vect_s8[2], b.vect_s8[2]); + res_m512i.vect_s8[3] = vqaddq_s8(a.vect_s8[3], b.vect_s8[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_adds_epi16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s16[0] = vqaddq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m512i.vect_s16[1] = vqaddq_s16(a.vect_s16[1], b.vect_s16[1]); + res_m512i.vect_s16[2] = vqaddq_s16(a.vect_s16[2], b.vect_s16[2]); + res_m512i.vect_s16[3] = vqaddq_s16(a.vect_s16[3], b.vect_s16[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_adds_epu8(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_u8[0] = vqaddq_u8(a.vect_u8[0], b.vect_u8[0]); + res_m512i.vect_u8[1] = vqaddq_u8(a.vect_u8[1], b.vect_u8[1]); + res_m512i.vect_u8[2] = vqaddq_u8(a.vect_u8[2], b.vect_u8[2]); + res_m512i.vect_u8[3] = vqaddq_u8(a.vect_u8[3], b.vect_u8[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_adds_epu16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_u16[0] = vqaddq_u16(a.vect_u16[0], b.vect_u16[0]); + res_m512i.vect_u16[1] = vqaddq_u16(a.vect_u16[1], b.vect_u16[1]); + res_m512i.vect_u16[2] = vqaddq_u16(a.vect_u16[2], b.vect_u16[2]); + res_m512i.vect_u16[3] = vqaddq_u16(a.vect_u16[3], b.vect_u16[3]); + return res_m512i; +} + +FORCE_INLINE __m512 _mm512_add_ps(__m512 a, __m512 b) +{ + __m512 res_m512; + res_m512.vect_f32[0] = vaddq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m512.vect_f32[1] = vaddq_f32(a.vect_f32[1], b.vect_f32[1]); + res_m512.vect_f32[2] = vaddq_f32(a.vect_f32[2], b.vect_f32[2]); + res_m512.vect_f32[3] = vaddq_f32(a.vect_f32[3], b.vect_f32[3]); + return res_m512; +} + +FORCE_INLINE __m512d _mm512_add_pd(__m512d a, __m512d b) +{ + __m512d res_m512d; + res_m512d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m512d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); + res_m512d.vect_f64[2] = vaddq_f64(a.vect_f64[2], b.vect_f64[2]); + res_m512d.vect_f64[3] = vaddq_f64(a.vect_f64[3], b.vect_f64[3]); + return res_m512d; +} + +FORCE_INLINE __m512 _mm512_add_round_ps (__m512 a, __m512 b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == (_MM_FROUND_CUR_DIRECTION))); + (void)rounding; + a.vect_f32[0] = vaddq_f32(a.vect_f32[0], b.vect_f32[0]); + a.vect_f32[1] = vaddq_f32(a.vect_f32[1], b.vect_f32[1]); + a.vect_f32[2] = vaddq_f32(a.vect_f32[2], b.vect_f32[2]); + a.vect_f32[3] = vaddq_f32(a.vect_f32[3], b.vect_f32[3]); + return a; +} + +FORCE_INLINE __m512d _mm512_add_round_pd (__m512d a, __m512d b, int rounding) +{ + assert((rounding == 
(_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == (_MM_FROUND_CUR_DIRECTION))); + (void)rounding; + a.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); + a.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); + a.vect_f64[2] = vaddq_f64(a.vect_f64[2], b.vect_f64[2]); + a.vect_f64[3] = vaddq_f64(a.vect_f64[3], b.vect_f64[3]); + return a; +} + +FORCE_INLINE __m512 _mm512_addn_ps (__m512 a, __m512 b) +{ + a.vect_f32[0] = vnegq_f32(vaddq_f32(a.vect_f32[0], b.vect_f32[0])); + a.vect_f32[1] = vnegq_f32(vaddq_f32(a.vect_f32[1], b.vect_f32[1])); + a.vect_f32[2] = vnegq_f32(vaddq_f32(a.vect_f32[2], b.vect_f32[2])); + a.vect_f32[3] = vnegq_f32(vaddq_f32(a.vect_f32[3], b.vect_f32[3])); + return a; +} + +FORCE_INLINE __m512d _mm512_addn_pd (__m512d a, __m512d b) +{ + a.vect_f64[0] = vnegq_f64(vaddq_f64(a.vect_f64[0], b.vect_f64[0])); + a.vect_f64[1] = vnegq_f64(vaddq_f64(a.vect_f64[1], b.vect_f64[1])); + a.vect_f64[2] = vnegq_f64(vaddq_f64(a.vect_f64[2], b.vect_f64[2])); + a.vect_f64[3] = vnegq_f64(vaddq_f64(a.vect_f64[3], b.vect_f64[3])); + return a; +} + +FORCE_INLINE __m512 _mm512_addn_round_ps (__m512 a, __m512 b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == (_MM_FROUND_CUR_DIRECTION))); + (void)rounding; + a.vect_f32[0] = vnegq_f32(vaddq_f32(a.vect_f32[0], b.vect_f32[0])); + a.vect_f32[1] = vnegq_f32(vaddq_f32(a.vect_f32[1], b.vect_f32[1])); + a.vect_f32[2] = vnegq_f32(vaddq_f32(a.vect_f32[2], b.vect_f32[2])); + a.vect_f32[3] = vnegq_f32(vaddq_f32(a.vect_f32[3], b.vect_f32[3])); + return a; +} + +FORCE_INLINE __m512d _mm512_addn_round_pd (__m512d a, __m512d b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == (_MM_FROUND_CUR_DIRECTION))); + (void)rounding; + a.vect_f64[0] = vnegq_f64(vaddq_f64(a.vect_f64[0], b.vect_f64[0])); + a.vect_f64[1] = vnegq_f64(vaddq_f64(a.vect_f64[1], b.vect_f64[1])); + a.vect_f64[2] = vnegq_f64(vaddq_f64(a.vect_f64[2], b.vect_f64[2])); + a.vect_f64[3] = vnegq_f64(vaddq_f64(a.vect_f64[3], b.vect_f64[3])); + return a; +} + +FORCE_INLINE __m512i _mm512_addsetc_epi32 (__m512i v2, __m512i v3, __mmask16 *k2_res) +{ + __m512i res, carry; + res.vect_u32[0] = vaddq_u32(v2.vect_u32[0], v3.vect_u32[0]); + res.vect_u32[1] = vaddq_u32(v2.vect_u32[1], v3.vect_u32[1]); + res.vect_u32[2] = vaddq_u32(v2.vect_u32[2], v3.vect_u32[2]); + res.vect_u32[3] = vaddq_u32(v2.vect_u32[3], v3.vect_u32[3]); + carry.vect_u32[0] = vcltq_u32(res.vect_u32[0], v3.vect_u32[0]); + carry.vect_u32[1] = vcltq_u32(res.vect_u32[1], v3.vect_u32[1]); + carry.vect_u32[2] = vcltq_u32(res.vect_u32[2], v3.vect_u32[2]); + carry.vect_u32[3] = vcltq_u32(res.vect_u32[3], v3.vect_u32[3]); + PICK_HB_32x16(carry, k2_res); + return res; +} + +FORCE_INLINE __m512i _mm512_addsets_epi32 (__m512i v2, __m512i v3, __mmask16 *sign) +{ + __m512i res, tmp; + res.vect_s32[0] = vaddq_s32(v2.vect_s32[0], v3.vect_s32[0]); + 
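/* the remaining three lanes of the sum follow; tmp then ANDs v2 and v3 so that PICK_HB_32x16 (assumed to pack each 32-bit lane's high bit) can derive the 16-bit sign mask */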
res.vect_s32[1] = vaddq_s32(v2.vect_s32[1], v3.vect_s32[1]); + res.vect_s32[2] = vaddq_s32(v2.vect_s32[2], v3.vect_s32[2]); + res.vect_s32[3] = vaddq_s32(v2.vect_s32[3], v3.vect_s32[3]); + tmp.vect_u32[0] = vandq_u32(v2.vect_u32[0], v3.vect_u32[0]); + tmp.vect_u32[1] = vandq_u32(v2.vect_u32[1], v3.vect_u32[1]); + tmp.vect_u32[2] = vandq_u32(v2.vect_u32[2], v3.vect_u32[2]); + tmp.vect_u32[3] = vandq_u32(v2.vect_u32[3], v3.vect_u32[3]); + PICK_HB_32x16(tmp, sign); + return res; +} + +FORCE_INLINE __m512 _mm512_addsets_ps (__m512 v2, __m512 v3, __mmask16 *sign) +{ + __m512 res; + __m512i tmp; + res.vect_f32[0] = vaddq_f32(v2.vect_f32[0], v3.vect_f32[0]); + res.vect_f32[1] = vaddq_f32(v2.vect_f32[1], v3.vect_f32[1]); + res.vect_f32[2] = vaddq_f32(v2.vect_f32[2], v3.vect_f32[2]); + res.vect_f32[3] = vaddq_f32(v2.vect_f32[3], v3.vect_f32[3]); + tmp.vect_u32[0] = vandq_u32(vreinterpretq_u32_f32(v2.vect_f32[0]), vreinterpretq_u32_f32(v3.vect_f32[0])); + tmp.vect_u32[1] = vandq_u32(vreinterpretq_u32_f32(v2.vect_f32[1]), vreinterpretq_u32_f32(v3.vect_f32[1])); + tmp.vect_u32[2] = vandq_u32(vreinterpretq_u32_f32(v2.vect_f32[2]), vreinterpretq_u32_f32(v3.vect_f32[2])); + tmp.vect_u32[3] = vandq_u32(vreinterpretq_u32_f32(v2.vect_f32[3]), vreinterpretq_u32_f32(v3.vect_f32[3])); + PICK_HB_32x16(tmp, sign); + return res; +} + +FORCE_INLINE __m512 _mm512_addsets_round_ps (__m512 v2, __m512 v3, __mmask16 *sign, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == (_MM_FROUND_CUR_DIRECTION))); + (void)rounding; + return _mm512_addsets_ps(v2, v3, sign); +} + +FORCE_INLINE __m512i _mm512_sub_epi16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s16[0] = vsubq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m512i.vect_s16[1] = vsubq_s16(a.vect_s16[1], b.vect_s16[1]); + res_m512i.vect_s16[2] = vsubq_s16(a.vect_s16[2], b.vect_s16[2]); + res_m512i.vect_s16[3] = vsubq_s16(a.vect_s16[3], b.vect_s16[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_sub_epi32(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = vsubq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m512i.vect_s32[1] = vsubq_s32(a.vect_s32[1], b.vect_s32[1]); + res_m512i.vect_s32[2] = vsubq_s32(a.vect_s32[2], b.vect_s32[2]); + res_m512i.vect_s32[3] = vsubq_s32(a.vect_s32[3], b.vect_s32[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_sub_epi64(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s64[0] = vsubq_s64(a.vect_s64[0], b.vect_s64[0]); + res_m512i.vect_s64[1] = vsubq_s64(a.vect_s64[1], b.vect_s64[1]); + res_m512i.vect_s64[2] = vsubq_s64(a.vect_s64[2], b.vect_s64[2]); + res_m512i.vect_s64[3] = vsubq_s64(a.vect_s64[3], b.vect_s64[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_sub_epi8(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s8[0] = vsubq_s8(a.vect_s8[0], b.vect_s8[0]); + res_m512i.vect_s8[1] = vsubq_s8(a.vect_s8[1], b.vect_s8[1]); + res_m512i.vect_s8[2] = vsubq_s8(a.vect_s8[2], b.vect_s8[2]); + res_m512i.vect_s8[3] = vsubq_s8(a.vect_s8[3], b.vect_s8[3]); + return res_m512i; +} + +FORCE_INLINE __m512d _mm512_sub_pd(__m512d a, __m512d b) +{ + __m512d res_m512d; + res_m512d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m512d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); + res_m512d.vect_f64[2] = 
vsubq_f64(a.vect_f64[2], b.vect_f64[2]); + res_m512d.vect_f64[3] = vsubq_f64(a.vect_f64[3], b.vect_f64[3]); + return res_m512d; +} + +FORCE_INLINE __m512 _mm512_sub_ps(__m512 a, __m512 b) +{ + __m512 res_m512; + res_m512.vect_f32[0] = vsubq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m512.vect_f32[1] = vsubq_f32(a.vect_f32[1], b.vect_f32[1]); + res_m512.vect_f32[2] = vsubq_f32(a.vect_f32[2], b.vect_f32[2]); + res_m512.vect_f32[3] = vsubq_f32(a.vect_f32[3], b.vect_f32[3]); + return res_m512; +} + +FORCE_INLINE __m512d _mm512_sub_round_pd(__m512d a, __m512d b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == _MM_FROUND_CUR_DIRECTION)); + (void)rounding; + __m512d res_m512d; + res_m512d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m512d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); + res_m512d.vect_f64[2] = vsubq_f64(a.vect_f64[2], b.vect_f64[2]); + res_m512d.vect_f64[3] = vsubq_f64(a.vect_f64[3], b.vect_f64[3]); + return res_m512d; +} + +FORCE_INLINE __m512 _mm512_sub_round_ps(__m512 a, __m512 b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == _MM_FROUND_CUR_DIRECTION)); + (void)rounding; + __m512 res_m512; + res_m512.vect_f32[0] = vsubq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m512.vect_f32[1] = vsubq_f32(a.vect_f32[1], b.vect_f32[1]); + res_m512.vect_f32[2] = vsubq_f32(a.vect_f32[2], b.vect_f32[2]); + res_m512.vect_f32[3] = vsubq_f32(a.vect_f32[3], b.vect_f32[3]); + return res_m512; +} + +FORCE_INLINE __m512i _mm512_subr_epi32 (__m512i v2, __m512i v3) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = vsubq_s32(v3.vect_s32[0], v2.vect_s32[0]); + res_m512i.vect_s32[1] = vsubq_s32(v3.vect_s32[1], v2.vect_s32[1]); + res_m512i.vect_s32[2] = vsubq_s32(v3.vect_s32[2], v2.vect_s32[2]); + res_m512i.vect_s32[3] = vsubq_s32(v3.vect_s32[3], v2.vect_s32[3]); + return res_m512i; +} + +FORCE_INLINE __m512 _mm512_subr_ps (__m512 v2, __m512 v3) +{ + __m512 res_m512; + res_m512.vect_f32[0] = vsubq_f32(v3.vect_f32[0], v2.vect_f32[0]); + res_m512.vect_f32[1] = vsubq_f32(v3.vect_f32[1], v2.vect_f32[1]); + res_m512.vect_f32[2] = vsubq_f32(v3.vect_f32[2], v2.vect_f32[2]); + res_m512.vect_f32[3] = vsubq_f32(v3.vect_f32[3], v2.vect_f32[3]); + return res_m512; +} + +FORCE_INLINE __m512d _mm512_subr_pd (__m512d v2, __m512d v3) +{ + __m512d res_m512d; + res_m512d.vect_f64[0] = vsubq_f64(v3.vect_f64[0], v2.vect_f64[0]); + res_m512d.vect_f64[1] = vsubq_f64(v3.vect_f64[1], v2.vect_f64[1]); + res_m512d.vect_f64[2] = vsubq_f64(v3.vect_f64[2], v2.vect_f64[2]); + res_m512d.vect_f64[3] = vsubq_f64(v3.vect_f64[3], v2.vect_f64[3]); + return res_m512d; +} + +FORCE_INLINE __m512 _mm512_subr_round_ps (__m512 v2, __m512 v3, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == _MM_FROUND_CUR_DIRECTION)); + (void)rounding; + return _mm512_subr_ps(v2, v3); +} + +FORCE_INLINE __m512d 
_mm512_subr_round_pd (__m512d v2, __m512d v3, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == _MM_FROUND_CUR_DIRECTION)); + (void)rounding; + return _mm512_subr_pd(v2, v3); +} + +FORCE_INLINE __m512i _mm512_subsetb_epi32 (__m512i v2, __m512i v3, __mmask16 *borrow) +{ + __m512i res, carry; + res.vect_s32[0] = vsubq_s32(v2.vect_s32[0], v3.vect_s32[0]); + res.vect_s32[1] = vsubq_s32(v2.vect_s32[1], v3.vect_s32[1]); + res.vect_s32[2] = vsubq_s32(v2.vect_s32[2], v3.vect_s32[2]); + res.vect_s32[3] = vsubq_s32(v2.vect_s32[3], v3.vect_s32[3]); + carry.vect_u32[0] = vcltq_u32(v2.vect_u32[0], v3.vect_u32[0]); + carry.vect_u32[1] = vcltq_u32(v2.vect_u32[1], v3.vect_u32[1]); + carry.vect_u32[2] = vcltq_u32(v2.vect_u32[2], v3.vect_u32[2]); + carry.vect_u32[3] = vcltq_u32(v2.vect_u32[3], v3.vect_u32[3]); + PICK_HB_32x16(carry, borrow); + return res; +} + +FORCE_INLINE __m512i _mm512_subrsetb_epi32 (__m512i v2, __m512i v3, __mmask16 *borrow) +{ + return _mm512_subsetb_epi32(v3, v2, borrow); +} + +FORCE_INLINE __m512i _mm512_subs_epi16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s16[0] = vqsubq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m512i.vect_s16[1] = vqsubq_s16(a.vect_s16[1], b.vect_s16[1]); + res_m512i.vect_s16[2] = vqsubq_s16(a.vect_s16[2], b.vect_s16[2]); + res_m512i.vect_s16[3] = vqsubq_s16(a.vect_s16[3], b.vect_s16[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_subs_epi8(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s8[0] = vqsubq_s8(a.vect_s8[0], b.vect_s8[0]); + res_m512i.vect_s8[1] = vqsubq_s8(a.vect_s8[1], b.vect_s8[1]); + res_m512i.vect_s8[2] = vqsubq_s8(a.vect_s8[2], b.vect_s8[2]); + res_m512i.vect_s8[3] = vqsubq_s8(a.vect_s8[3], b.vect_s8[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_subs_epu16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_u16[0] = vqsubq_u16(a.vect_u16[0], b.vect_u16[0]); + res_m512i.vect_u16[1] = vqsubq_u16(a.vect_u16[1], b.vect_u16[1]); + res_m512i.vect_u16[2] = vqsubq_u16(a.vect_u16[2], b.vect_u16[2]); + res_m512i.vect_u16[3] = vqsubq_u16(a.vect_u16[3], b.vect_u16[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_subs_epu8(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_u8[0] = vqsubq_u8(a.vect_u8[0], b.vect_u8[0]); + res_m512i.vect_u8[1] = vqsubq_u8(a.vect_u8[1], b.vect_u8[1]); + res_m512i.vect_u8[2] = vqsubq_u8(a.vect_u8[2], b.vect_u8[2]); + res_m512i.vect_u8[3] = vqsubq_u8(a.vect_u8[3], b.vect_u8[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_permutexvar_epi32 (__m512i idx, __m512i a) +{ + __m512i res; + int32x4_t low4bit = vdupq_n_s32(0x0f); + idx.vect_s32[0] = vandq_s32(idx.vect_s32[0], low4bit); + idx.vect_s32[1] = vandq_s32(idx.vect_s32[1], low4bit); + idx.vect_s32[2] = vandq_s32(idx.vect_s32[2], low4bit); + idx.vect_s32[3] = vandq_s32(idx.vect_s32[3], low4bit); + int32_t p_a[16], p_i[16]; + vst1q_s32(p_a, a.vect_s32[0]); + vst1q_s32(p_a + 4, a.vect_s32[1]); + vst1q_s32(p_a + 8, a.vect_s32[2]); + vst1q_s32(p_a + 12, a.vect_s32[3]); + vst1q_s32(p_i, idx.vect_s32[0]); + vst1q_s32(p_i + 4, idx.vect_s32[1]); + vst1q_s32(p_i + 8, idx.vect_s32[2]); + vst1q_s32(p_i + 12, idx.vect_s32[3]); + res.vect_s32[0] = vsetq_lane_s32(p_a[p_i[0]], res.vect_s32[0], 0); + res.vect_s32[0] = vsetq_lane_s32(p_a[p_i[1]], 
res.vect_s32[0], 1); + res.vect_s32[0] = vsetq_lane_s32(p_a[p_i[2]], res.vect_s32[0], 2); + res.vect_s32[0] = vsetq_lane_s32(p_a[p_i[3]], res.vect_s32[0], 3); + res.vect_s32[1] = vsetq_lane_s32(p_a[p_i[4]], res.vect_s32[1], 0); + res.vect_s32[1] = vsetq_lane_s32(p_a[p_i[5]], res.vect_s32[1], 1); + res.vect_s32[1] = vsetq_lane_s32(p_a[p_i[6]], res.vect_s32[1], 2); + res.vect_s32[1] = vsetq_lane_s32(p_a[p_i[7]], res.vect_s32[1], 3); + res.vect_s32[2] = vsetq_lane_s32(p_a[p_i[8]], res.vect_s32[2], 0); + res.vect_s32[2] = vsetq_lane_s32(p_a[p_i[9]], res.vect_s32[2], 1); + res.vect_s32[2] = vsetq_lane_s32(p_a[p_i[10]], res.vect_s32[2], 2); + res.vect_s32[2] = vsetq_lane_s32(p_a[p_i[11]], res.vect_s32[2], 3); + res.vect_s32[3] = vsetq_lane_s32(p_a[p_i[12]], res.vect_s32[3], 0); + res.vect_s32[3] = vsetq_lane_s32(p_a[p_i[13]], res.vect_s32[3], 1); + res.vect_s32[3] = vsetq_lane_s32(p_a[p_i[14]], res.vect_s32[3], 2); + res.vect_s32[3] = vsetq_lane_s32(p_a[p_i[15]], res.vect_s32[3], 3); + return res; +} + +FORCE_INLINE __m512i _mm512_permutexvar_epi64(__m512i idx, __m512i a) +{ + __m512i res; + int64x2_t low3bit = vdupq_n_s64(0x07); + idx.vect_s64[0] = vandq_s64(idx.vect_s64[0], low3bit); + idx.vect_s64[1] = vandq_s64(idx.vect_s64[1], low3bit); + idx.vect_s64[2] = vandq_s64(idx.vect_s64[2], low3bit); + idx.vect_s64[3] = vandq_s64(idx.vect_s64[3], low3bit); + int64_t p_a[8], p_i[8]; + vst1q_s64(p_a, a.vect_s64[0]); + vst1q_s64(p_a + 2, a.vect_s64[1]); + vst1q_s64(p_a + 4, a.vect_s64[2]); + vst1q_s64(p_a + 6, a.vect_s64[3]); + vst1q_s64(p_i, idx.vect_s64[0]); + vst1q_s64(p_i + 2, idx.vect_s64[1]); + vst1q_s64(p_i + 4, idx.vect_s64[2]); + vst1q_s64(p_i + 6, idx.vect_s64[3]); + res.vect_s64[0] = vsetq_lane_s64(p_a[p_i[0]], res.vect_s64[0], 0); + res.vect_s64[0] = vsetq_lane_s64(p_a[p_i[1]], res.vect_s64[0], 1); + res.vect_s64[1] = vsetq_lane_s64(p_a[p_i[2]], res.vect_s64[1], 0); + res.vect_s64[1] = vsetq_lane_s64(p_a[p_i[3]], res.vect_s64[1], 1); + res.vect_s64[2] = vsetq_lane_s64(p_a[p_i[4]], res.vect_s64[2], 0); + res.vect_s64[2] = vsetq_lane_s64(p_a[p_i[5]], res.vect_s64[2], 1); + res.vect_s64[3] = vsetq_lane_s64(p_a[p_i[6]], res.vect_s64[3], 0); + res.vect_s64[3] = vsetq_lane_s64(p_a[p_i[7]], res.vect_s64[3], 1); + return res; +} + +FORCE_INLINE __mmask64 _mm512_test_epi8_mask(__m512i a, __m512i b) +{ + uint8x16_t mask_and = vld1q_u8(g_mask_epi8); + __m512i tmp; + tmp.vect_u8[0] = vandq_u8(vtstq_u8(a.vect_u8[0], b.vect_u8[0]), mask_and); + tmp.vect_u8[1] = vandq_u8(vtstq_u8(a.vect_u8[1], b.vect_u8[1]), mask_and); + tmp.vect_u8[2] = vandq_u8(vtstq_u8(a.vect_u8[2], b.vect_u8[2]), mask_and); + tmp.vect_u8[3] = vandq_u8(vtstq_u8(a.vect_u8[3], b.vect_u8[3]), mask_and); + uint8_t r[8]; + __asm__ __volatile__ ( + "addv %b[r0], %[t0].8b \n\t" + "addv %b[r2], %[t1].8b \n\t" + "addv %b[r4], %[t2].8b \n\t" + "addv %b[r6], %[t3].8b \n\t" + "ins %[t0].d[0], %[t0].d[1] \n\t" + "ins %[t1].d[0], %[t1].d[1] \n\t" + "ins %[t2].d[0], %[t2].d[1] \n\t" + "ins %[t3].d[0], %[t3].d[1] \n\t" + "addv %b[r1], %[t0].8b \n\t" + "addv %b[r3], %[t1].8b \n\t" + "addv %b[r5], %[t2].8b \n\t" + "addv %b[r7], %[t3].8b \n\t" + :[r0]"=w"(r[0]), [r1]"=w"(r[1]), [r2]"=w"(r[2]), [r3]"=w"(r[3]), [r4]"=w"(r[4]), [r5]"=w"(r[5]), [r6]"=w"(r[6]), + [r7]"=w"(r[7]), + [t0]"+w"(tmp.vect_u8[0]), [t1]"+w"(tmp.vect_u8[1]), [t2]"+w"(tmp.vect_u8[2]), [t3]"+w"(tmp.vect_u8[3]) + ); + uint64x1_t res = vreinterpret_u64_u8(vld1_u8((const uint8_t *)r)); + return vget_lane_u64(res, 0); +} + +FORCE_INLINE __mmask16 _mm512_test_epi32_mask(__m512i a, __m512i b) 
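/* vtstq_u32 flags lanes where (a & b) != 0; assuming g_mask_epi32 holds the per-lane weights {1, 2, 4, 8}, vaddvq_u32 packs each 128-bit group into one nibble of the 16-bit mask */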
+{ + uint32x4_t mask_and = vld1q_u32(g_mask_epi32); + __m512i tmp; + tmp.vect_u32[0] = vandq_u32(vtstq_u32(a.vect_u32[0], b.vect_u32[0]), mask_and); + tmp.vect_u32[1] = vandq_u32(vtstq_u32(a.vect_u32[1], b.vect_u32[1]), mask_and); + tmp.vect_u32[2] = vandq_u32(vtstq_u32(a.vect_u32[2], b.vect_u32[2]), mask_and); + tmp.vect_u32[3] = vandq_u32(vtstq_u32(a.vect_u32[3], b.vect_u32[3]), mask_and); + uint32_t r0 = vaddvq_u32(tmp.vect_u32[0]); + uint32_t r1 = vaddvq_u32(tmp.vect_u32[1]); + uint32_t r2 = vaddvq_u32(tmp.vect_u32[2]); + uint32_t r3 = vaddvq_u32(tmp.vect_u32[3]); + __mmask16 res = r0 | (r1 << 4) | (r2 << 8) | (r3 << 12); + return res; +} + +FORCE_INLINE __mmask8 _mm512_test_epi64_mask(__m512i a, __m512i b) +{ + uint64x2_t mask_and = vld1q_u64(g_mask_epi64); + __m512i tmp; + tmp.vect_u64[0] = vandq_u64(vtstq_u64(a.vect_u64[0], b.vect_u64[0]), mask_and); + tmp.vect_u64[1] = vandq_u64(vtstq_u64(a.vect_u64[1], b.vect_u64[1]), mask_and); + tmp.vect_u64[2] = vandq_u64(vtstq_u64(a.vect_u64[2], b.vect_u64[2]), mask_and); + tmp.vect_u64[3] = vandq_u64(vtstq_u64(a.vect_u64[3], b.vect_u64[3]), mask_and); + uint32_t r0 = vaddvq_u32(tmp.vect_u32[0]); + uint32_t r1 = vaddvq_u32(tmp.vect_u32[1]); + uint32_t r2 = vaddvq_u32(tmp.vect_u32[2]); + uint32_t r3 = vaddvq_u32(tmp.vect_u32[3]); + __mmask8 res = r0 | (r1 << 2) | (r2 << 4) | (r3 << 6); + return res; +} + +FORCE_INLINE __m512i _mm512_mul_epi32(__m512i a, __m512i b) +{ + __asm__ __volatile__ ( + "ins %[a0].s[1], %[a0].s[2] \n\t" + "ins %[a1].s[1], %[a1].s[2] \n\t" + "ins %[a2].s[1], %[a2].s[2] \n\t" + "ins %[a3].s[1], %[a3].s[2] \n\t" + "ins %[b0].s[1], %[b0].s[2] \n\t" + "ins %[b1].s[1], %[b1].s[2] \n\t" + "ins %[b2].s[1], %[b2].s[2] \n\t" + "ins %[b3].s[1], %[b3].s[2] \n\t" + "smull %[a0].2d, %[a0].2s, %[b0].2s \n\t" + "smull %[a1].2d, %[a1].2s, %[b1].2s \n\t" + "smull %[a2].2d, %[a2].2s, %[b2].2s \n\t" + "smull %[a3].2d, %[a3].2s, %[b3].2s \n\t" + :[a0]"+w"(a.vect_s32[0]), [a1]"+w"(a.vect_s32[1]), [a2]"+w"(a.vect_s32[2]), [a3]"+w"(a.vect_s32[3]), + [b0]"+w"(b.vect_s32[0]), [b1]"+w"(b.vect_s32[1]), [b2]"+w"(b.vect_s32[2]), [b3]"+w"(b.vect_s32[3]) + : + : + ); + return a; +} + +FORCE_INLINE __m512i _mm512_mul_epu32(__m512i a, __m512i b) +{ + __asm__ __volatile__ ( + "ins %[a0].s[1], %[a0].s[2] \n\t" + "ins %[a1].s[1], %[a1].s[2] \n\t" + "ins %[a2].s[1], %[a2].s[2] \n\t" + "ins %[a3].s[1], %[a3].s[2] \n\t" + "ins %[b0].s[1], %[b0].s[2] \n\t" + "ins %[b1].s[1], %[b1].s[2] \n\t" + "ins %[b2].s[1], %[b2].s[2] \n\t" + "ins %[b3].s[1], %[b3].s[2] \n\t" + "umull %[a0].2d, %[a0].2s, %[b0].2s \n\t" + "umull %[a1].2d, %[a1].2s, %[b1].2s \n\t" + "umull %[a2].2d, %[a2].2s, %[b2].2s \n\t" + "umull %[a3].2d, %[a3].2s, %[b3].2s \n\t" + :[a0]"+w"(a.vect_u32[0]), [a1]"+w"(a.vect_u32[1]), [a2]"+w"(a.vect_u32[2]), [a3]"+w"(a.vect_u32[3]), + [b0]"+w"(b.vect_u32[0]), [b1]"+w"(b.vect_u32[1]), [b2]"+w"(b.vect_u32[2]), [b3]"+w"(b.vect_u32[3]) + : + : + ); + return a; +} + +FORCE_INLINE __m512d _mm512_mul_pd(__m512d a, __m512d b) +{ + __m512d res_m512d; + res_m512d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m512d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); + res_m512d.vect_f64[2] = vmulq_f64(a.vect_f64[2], b.vect_f64[2]); + res_m512d.vect_f64[3] = vmulq_f64(a.vect_f64[3], b.vect_f64[3]); + return res_m512d; +} + +FORCE_INLINE __m512 _mm512_mul_ps(__m512 a, __m512 b) +{ + __m512 res_m512; + res_m512.vect_f32[0] = vmulq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m512.vect_f32[1] = vmulq_f32(a.vect_f32[1], b.vect_f32[1]); + res_m512.vect_f32[2] = 
vmulq_f32(a.vect_f32[2], b.vect_f32[2]); + res_m512.vect_f32[3] = vmulq_f32(a.vect_f32[3], b.vect_f32[3]); + return res_m512; +} + +FORCE_INLINE __m512i _mm512_mulhi_epi16(__m512i a, __m512i b) +{ + __m512i res; + res.vect_i256[0] = _mm256_mulhi_epi16(a.vect_i256[0], b.vect_i256[0]); + res.vect_i256[1] = _mm256_mulhi_epi16(a.vect_i256[1], b.vect_i256[1]); + return res; +} + +FORCE_INLINE __m512i _mm512_mulhi_epu16(__m512i a, __m512i b) +{ + __m512i res; + res.vect_i256[0] = _mm256_mulhi_epu16(a.vect_i256[0], b.vect_i256[0]); + res.vect_i256[1] = _mm256_mulhi_epu16(a.vect_i256[1], b.vect_i256[1]); + return res; +} + +FORCE_INLINE __m512i _mm512_mulhi_epi32 (__m512i a, __m512i b) +{ + __m512i res; + res.vect_i256[0] = _mm256_mulhi_epi32(a.vect_i256[0], b.vect_i256[0]); + res.vect_i256[1] = _mm256_mulhi_epi32(a.vect_i256[1], b.vect_i256[1]); + return res; +} + +FORCE_INLINE __m512i _mm512_mulhi_epu32 (__m512i a, __m512i b) +{ + __m512i res; + res.vect_i256[0] = _mm256_mulhi_epu32(a.vect_i256[0], b.vect_i256[0]); + res.vect_i256[1] = _mm256_mulhi_epu32(a.vect_i256[1], b.vect_i256[1]); + return res; +} + +FORCE_INLINE __m512i _mm512_mullo_epi16(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s16[0] = vmulq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m512i.vect_s16[1] = vmulq_s16(a.vect_s16[1], b.vect_s16[1]); + res_m512i.vect_s16[2] = vmulq_s16(a.vect_s16[2], b.vect_s16[2]); + res_m512i.vect_s16[3] = vmulq_s16(a.vect_s16[3], b.vect_s16[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_mullo_epi32(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = vmulq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m512i.vect_s32[1] = vmulq_s32(a.vect_s32[1], b.vect_s32[1]); + res_m512i.vect_s32[2] = vmulq_s32(a.vect_s32[2], b.vect_s32[2]); + res_m512i.vect_s32[3] = vmulq_s32(a.vect_s32[3], b.vect_s32[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_mullo_epi64(__m512i a, __m512i b) +{ + __m512i res; + res.vect_i256[0] = _mm256_mullo_epi64(a.vect_i256[0], b.vect_i256[0]); + res.vect_i256[1] = _mm256_mullo_epi64(a.vect_i256[1], b.vect_i256[1]); + return res; +} + +FORCE_INLINE __m512i _mm512_mullox_epi64(__m512i a, __m512i b) +{ + __m512i res; + res.vect_i256[0] = _mm256_mullo_epi64(a.vect_i256[0], b.vect_i256[0]); + res.vect_i256[1] = _mm256_mullo_epi64(a.vect_i256[1], b.vect_i256[1]); + return res; +} + +FORCE_INLINE __m512i _mm512_mulhrs_epi16(__m512i a, __m512i b) +{ + __m512i res; + res.vect_i256[0] = _mm256_mulhrs_epi16(a.vect_i256[0], b.vect_i256[0]); + res.vect_i256[1] = _mm256_mulhrs_epi16(a.vect_i256[1], b.vect_i256[1]); + return res; +} + +FORCE_INLINE __m512d _mm512_mul_round_pd(__m512d a, __m512d b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == _MM_FROUND_CUR_DIRECTION)); + (void)rounding; + __m512d res_m512d; + res_m512d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m512d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); + res_m512d.vect_f64[2] = vmulq_f64(a.vect_f64[2], b.vect_f64[2]); + res_m512d.vect_f64[3] = vmulq_f64(a.vect_f64[3], b.vect_f64[3]); + return res_m512d; +} + +FORCE_INLINE __m512 _mm512_mul_round_ps(__m512 a, __m512 b, int rounding) +{ + assert((rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)) 
|| \ + (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)) || \ + (rounding == (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)) || + (rounding == _MM_FROUND_CUR_DIRECTION)); + (void)rounding; + __m512 res_m512; + res_m512.vect_f32[0] = vmulq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m512.vect_f32[1] = vmulq_f32(a.vect_f32[1], b.vect_f32[1]); + res_m512.vect_f32[2] = vmulq_f32(a.vect_f32[2], b.vect_f32[2]); + res_m512.vect_f32[3] = vmulq_f32(a.vect_f32[3], b.vect_f32[3]); + return res_m512; +} + +FORCE_INLINE __m512i _mm512_sll_epi64(__m512i a, __m128i count) +{ + long long c = count.vect_s64[0]; /* keep the full 64-bit count: truncating to int could turn an out-of-range count into a small valid one */ + __m512i result_m512i; + if (likely(c >= 0 && c < 64)) { + int64x2_t vect_imm = vdupq_n_s64(c); /* the count is a runtime value, so use vshlq_s64 rather than the immediate-only vshlq_n_s64 */ + result_m512i.vect_s64[0] = vshlq_s64(a.vect_s64[0], vect_imm); + result_m512i.vect_s64[1] = vshlq_s64(a.vect_s64[1], vect_imm); + result_m512i.vect_s64[2] = vshlq_s64(a.vect_s64[2], vect_imm); + result_m512i.vect_s64[3] = vshlq_s64(a.vect_s64[3], vect_imm); + } else { + result_m512i.vect_s64[0] = vdupq_n_s64(0); + result_m512i.vect_s64[1] = vdupq_n_s64(0); + result_m512i.vect_s64[2] = vdupq_n_s64(0); + result_m512i.vect_s64[3] = vdupq_n_s64(0); + } + return result_m512i; +} + +FORCE_INLINE __m512i _mm512_slli_epi64(__m512i a, unsigned int imm8) +{ + __m512i result_m512i; + if (likely(imm8 < 64)) { + result_m512i.vect_s64[0] = vshlq_n_s64(a.vect_s64[0], imm8); + result_m512i.vect_s64[1] = vshlq_n_s64(a.vect_s64[1], imm8); + result_m512i.vect_s64[2] = vshlq_n_s64(a.vect_s64[2], imm8); + result_m512i.vect_s64[3] = vshlq_n_s64(a.vect_s64[3], imm8); + } else { + result_m512i.vect_s64[0] = vdupq_n_s64(0); + result_m512i.vect_s64[1] = vdupq_n_s64(0); + result_m512i.vect_s64[2] = vdupq_n_s64(0); + result_m512i.vect_s64[3] = vdupq_n_s64(0); + } + return result_m512i; +} + +FORCE_INLINE __m512i _mm512_srli_epi64(__m512i a, unsigned int imm8) +{ + __m512i result_m512i; + if (likely(imm8 < 64)) { + int64x2_t vect_imm = vdupq_n_s64(-(int64_t)imm8); /* cast before negating: -imm8 on an unsigned int wraps to a huge positive value instead of the negative (rightward) shift count vshlq_u64 expects */ + result_m512i.vect_u64[0] = vshlq_u64(a.vect_u64[0], vect_imm); + result_m512i.vect_u64[1] = vshlq_u64(a.vect_u64[1], vect_imm); + result_m512i.vect_u64[2] = vshlq_u64(a.vect_u64[2], vect_imm); + result_m512i.vect_u64[3] = vshlq_u64(a.vect_u64[3], vect_imm); + } else { + result_m512i.vect_u64[0] = vdupq_n_u64(0); + result_m512i.vect_u64[1] = vdupq_n_u64(0); + result_m512i.vect_u64[2] = vdupq_n_u64(0); + result_m512i.vect_u64[3] = vdupq_n_u64(0); + } + return result_m512i; +} + +FORCE_INLINE __m512i _mm512_bslli_epi128(__m512i a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 255); + __m512i res_m512i; + if (likely(imm8 > 0 && imm8 <= 15)) { + res_m512i.vect_s8[0] = vextq_s8(vdupq_n_s8(0), a.vect_s8[0], 16 - imm8); + res_m512i.vect_s8[1] = vextq_s8(vdupq_n_s8(0), a.vect_s8[1], 16 - imm8); + res_m512i.vect_s8[2] = vextq_s8(vdupq_n_s8(0), a.vect_s8[2], 16 - imm8); + res_m512i.vect_s8[3] = vextq_s8(vdupq_n_s8(0), a.vect_s8[3], 16 - imm8); + } else if (imm8 == 0) { + res_m512i = a; + } else { + res_m512i.vect_s8[0] = vdupq_n_s8(0); + res_m512i.vect_s8[1] = vdupq_n_s8(0); + res_m512i.vect_s8[2] = vdupq_n_s8(0); + res_m512i.vect_s8[3] = vdupq_n_s8(0); + } + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_bsrli_epi128(__m512i a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 255); + __m512i res_m512i; + if (likely(imm8 > 0 && imm8 <= 15)) { + res_m512i.vect_s8[0] = vextq_s8(a.vect_s8[0], vdupq_n_s8(0), imm8); + res_m512i.vect_s8[1] = vextq_s8(a.vect_s8[1], vdupq_n_s8(0), imm8); + res_m512i.vect_s8[2] = vextq_s8(a.vect_s8[2], vdupq_n_s8(0), imm8); + res_m512i.vect_s8[3] = vextq_s8(a.vect_s8[3], vdupq_n_s8(0), imm8); + } else if (imm8 == 0) { +
res_m512i = a; + } else { + res_m512i.vect_s8[0] = vdupq_n_s8(0); + res_m512i.vect_s8[1] = vdupq_n_s8(0); + res_m512i.vect_s8[2] = vdupq_n_s8(0); + res_m512i.vect_s8[3] = vdupq_n_s8(0); + } + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_unpackhi_epi8(__m512i a, __m512i b) +{ + __m512i result_m512i; + result_m512i.vect_s8[0] = vzip2q_s8(a.vect_s8[0], b.vect_s8[0]); + result_m512i.vect_s8[1] = vzip2q_s8(a.vect_s8[1], b.vect_s8[1]); + result_m512i.vect_s8[2] = vzip2q_s8(a.vect_s8[2], b.vect_s8[2]); + result_m512i.vect_s8[3] = vzip2q_s8(a.vect_s8[3], b.vect_s8[3]); + return result_m512i; +} + +FORCE_INLINE __m512i _mm512_unpacklo_epi8(__m512i a, __m512i b) +{ + __m512i result_m512i; + result_m512i.vect_s8[0] = vzip1q_s8(a.vect_s8[0], b.vect_s8[0]); + result_m512i.vect_s8[1] = vzip1q_s8(a.vect_s8[1], b.vect_s8[1]); + result_m512i.vect_s8[2] = vzip1q_s8(a.vect_s8[2], b.vect_s8[2]); + result_m512i.vect_s8[3] = vzip1q_s8(a.vect_s8[3], b.vect_s8[3]); + return result_m512i; +} + +FORCE_INLINE __m512d _mm512_cmp_pd(__m512d a, __m512d b, const int imm8) +{ + assert(imm8 < 32 && imm8 >= 0); + __m512d dst; + dst.vect_f64[0] = (float64x2_t)g_FunListCmp256Pd[imm8].cmpFun(a.vect_f64[0], b.vect_f64[0]); + dst.vect_f64[1] = (float64x2_t)g_FunListCmp256Pd[imm8].cmpFun(a.vect_f64[1], b.vect_f64[1]); + dst.vect_f64[2] = (float64x2_t)g_FunListCmp256Pd[imm8].cmpFun(a.vect_f64[2], b.vect_f64[2]); + dst.vect_f64[3] = (float64x2_t)g_FunListCmp256Pd[imm8].cmpFun(a.vect_f64[3], b.vect_f64[3]); + return dst; +} + +FORCE_INLINE __mmask8 _mm512_cmp_pd_mask(__m512d a, __m512d b, const int imm8) +{ + assert(imm8 < 32 && imm8 >= 0); + __m512d dst = _mm512_cmp_pd(a, b, imm8); + __mmask8 res = 0; + uint64x2_t vect_mask = vld1q_u64(g_mask_epi64); + __m512i tmp; + uint64_t r[4]; + __asm__ __volatile__ ( + "and %[t0].16b, %[d0].16b, %[mask].16b \n\t" + "and %[t1].16b, %[d1].16b, %[mask].16b \n\t" + "and %[t2].16b, %[d2].16b, %[mask].16b \n\t" + "and %[t3].16b, %[d3].16b, %[mask].16b \n\t" + "addp %d[r0], %[t0].2d \n\t" + "addp %d[r1], %[t1].2d \n\t" + "addp %d[r2], %[t2].2d \n\t" + "addp %d[r3], %[t3].2d \n\t" + :[t0]"+w"(tmp.vect_u64[0]), [t1]"+w"(tmp.vect_u64[1]), [t2]"+w"(tmp.vect_u64[2]), [t3]"+w"(tmp.vect_u64[3]), + [r0]"=w"(r[0]), [r1]"=w"(r[1]), [r2]"=w"(r[2]), [r3]"=w"(r[3]) + :[d0]"w"(dst.vect_f64[0]), [d1]"w"(dst.vect_f64[1]), [d2]"w"(dst.vect_f64[2]), [d3]"w"(dst.vect_f64[3]), + [mask]"w"(vect_mask) + ); + res = r[0] | (r[1] << 2) | (r[2] << 4) | (r[3] << 6); + return res; +} + +FORCE_INLINE __m512 _mm512_cmp_ps(__m512 a, __m512 b, const int imm8) +{ + assert(imm8 < 32 && imm8 >= 0); + __m512 dst; + dst.vect_f32[0] = vreinterpretq_f32_u32(g_FunListCmp256Ps[imm8].cmpFun(a.vect_f32[0], b.vect_f32[0])); + dst.vect_f32[1] = vreinterpretq_f32_u32(g_FunListCmp256Ps[imm8].cmpFun(a.vect_f32[1], b.vect_f32[1])); + dst.vect_f32[2] = vreinterpretq_f32_u32(g_FunListCmp256Ps[imm8].cmpFun(a.vect_f32[2], b.vect_f32[2])); + dst.vect_f32[3] = vreinterpretq_f32_u32(g_FunListCmp256Ps[imm8].cmpFun(a.vect_f32[3], b.vect_f32[3])); + return dst; +} + +FORCE_INLINE __mmask16 _mm512_cmp_ps_mask(__m512 a, __m512 b, const int imm8) +{ + assert(imm8 < 32 && imm8 >= 0); + __m512 dst = _mm512_cmp_ps(a, b, imm8); + __mmask16 res = 0; + uint32x4_t vect_mask = vld1q_u32(g_mask_epi32); + uint32_t r0 = vaddvq_u32(vandq_u32(vreinterpretq_u32_f32(dst.vect_f32[0]), vect_mask)); + uint32_t r1 = vaddvq_u32(vandq_u32(vreinterpretq_u32_f32(dst.vect_f32[1]), vect_mask)); + uint32_t r2 = vaddvq_u32(vandq_u32(vreinterpretq_u32_f32(dst.vect_f32[2]), 
vect_mask)); + uint32_t r3 = vaddvq_u32(vandq_u32(vreinterpretq_u32_f32(dst.vect_f32[3]), vect_mask)); + res = r0 | (r1 << 4) | (r2 << 8) | (r3 << 12); + return res; +} + +FORCE_INLINE __mmask16 _mm512_cmpeq_epi32_mask(__m512i a, __m512i b) +{ + uint32x4_t vect_mask = vld1q_u32(g_mask_epi32); + __m512i tmp; + tmp.vect_u32[0] = vandq_u32(vceqq_s32(a.vect_s32[0], b.vect_s32[0]), vect_mask); + tmp.vect_u32[1] = vandq_u32(vceqq_s32(a.vect_s32[1], b.vect_s32[1]), vect_mask); + tmp.vect_u32[2] = vandq_u32(vceqq_s32(a.vect_s32[2], b.vect_s32[2]), vect_mask); + tmp.vect_u32[3] = vandq_u32(vceqq_s32(a.vect_s32[3], b.vect_s32[3]), vect_mask); + uint32_t r0 = vaddvq_u32(tmp.vect_u32[0]); + uint32_t r1 = vaddvq_u32(tmp.vect_u32[1]); + uint32_t r2 = vaddvq_u32(tmp.vect_u32[2]); + uint32_t r3 = vaddvq_u32(tmp.vect_u32[3]); + __mmask16 result = r0 | (r1 << 4) | (r2 << 8) | (r3 << 12); + return result; +} + +FORCE_INLINE __mmask16 _mm512_cmplt_epi32_mask(__m512i a, __m512i b) +{ + uint32x4_t vect_mask = vld1q_u32(g_mask_epi32); + __m512i tmp; + tmp.vect_u32[0] = vandq_u32(vcltq_s32(a.vect_s32[0], b.vect_s32[0]), vect_mask); + tmp.vect_u32[1] = vandq_u32(vcltq_s32(a.vect_s32[1], b.vect_s32[1]), vect_mask); + tmp.vect_u32[2] = vandq_u32(vcltq_s32(a.vect_s32[2], b.vect_s32[2]), vect_mask); + tmp.vect_u32[3] = vandq_u32(vcltq_s32(a.vect_s32[3], b.vect_s32[3]), vect_mask); + uint32_t r0 = vaddvq_u32(tmp.vect_u32[0]); + uint32_t r1 = vaddvq_u32(tmp.vect_u32[1]); + uint32_t r2 = vaddvq_u32(tmp.vect_u32[2]); + uint32_t r3 = vaddvq_u32(tmp.vect_u32[3]); + __mmask16 result = r0 | (r1 << 4) | (r2 << 8) | (r3 << 12); + return result; +} + +FORCE_INLINE __mmask16 _mm512_cmple_epi32_mask(__m512i a, __m512i b) +{ + uint32x4_t vect_mask = vld1q_u32(g_mask_epi32); + __m512i tmp; + tmp.vect_u32[0] = vandq_u32(vcleq_s32(a.vect_s32[0], b.vect_s32[0]), vect_mask); + tmp.vect_u32[1] = vandq_u32(vcleq_s32(a.vect_s32[1], b.vect_s32[1]), vect_mask); + tmp.vect_u32[2] = vandq_u32(vcleq_s32(a.vect_s32[2], b.vect_s32[2]), vect_mask); + tmp.vect_u32[3] = vandq_u32(vcleq_s32(a.vect_s32[3], b.vect_s32[3]), vect_mask); + uint32_t r0 = vaddvq_u32(tmp.vect_u32[0]); + uint32_t r1 = vaddvq_u32(tmp.vect_u32[1]); + uint32_t r2 = vaddvq_u32(tmp.vect_u32[2]); + uint32_t r3 = vaddvq_u32(tmp.vect_u32[3]); + __mmask16 result = r0 | (r1 << 4) | (r2 << 8) | (r3 << 12); + return result; +} + +FORCE_INLINE __mmask16 _mm512_cmpneq_epi32_mask(__m512i a, __m512i b) +{ + return ~_mm512_cmpeq_epi32_mask(a, b); +} + +FORCE_INLINE __mmask16 _mm512_cmpnlt_epi32_mask(__m512i a, __m512i b) +{ + return ~_mm512_cmplt_epi32_mask(a, b); +} + +FORCE_INLINE __mmask16 _mm512_cmpnle_epi32_mask(__m512i a, __m512i b) +{ + return ~_mm512_cmple_epi32_mask(a, b); +} + +typedef __mmask16 (*TYPE_FUNC_EPI32)(__m512i a, __m512i b); +typedef struct { + _MM_CMPINT_ENUM cmpintEnum; + TYPE_FUNC_EPI32 cmpFun; +} FuncListEpi32; +static FuncListEpi32 g_FunListEpi32[] = {{_MM_CMPINT_EQ, _mm512_cmpeq_epi32_mask}, + {_MM_CMPINT_LT, _mm512_cmplt_epi32_mask}, + {_MM_CMPINT_LE, _mm512_cmple_epi32_mask}, + {_MM_CMPINT_FALSE, NULL}, + {_MM_CMPINT_NE, _mm512_cmpneq_epi32_mask}, + {_MM_CMPINT_NLT, _mm512_cmpnlt_epi32_mask}, + {_MM_CMPINT_NLE, _mm512_cmpnle_epi32_mask}, + {_MM_CMPINT_TRUE, NULL}}; + +FORCE_INLINE __mmask16 _mm512_cmp_epi32_mask(__m512i a, __m512i b, const _MM_CMPINT_ENUM imm8) +{ + if (unlikely(imm8 == _MM_CMPINT_FALSE)) { + return 0; + } + if (unlikely(imm8 == _MM_CMPINT_TRUE)) { + return 0xffff; + } + return g_FunListEpi32[imm8].cmpFun(a, b); +} + +FORCE_INLINE __mmask64 
_mm512_cmpeq_epi8_mask(__m512i a, __m512i b) +{ + uint8x16_t vect_mask = vld1q_u8(g_mask_epi8); + __m512i tmp; + tmp.vect_u8[0] = vandq_u8(vceqq_s8(a.vect_s8[0], b.vect_s8[0]), vect_mask); + tmp.vect_u8[1] = vandq_u8(vceqq_s8(a.vect_s8[1], b.vect_s8[1]), vect_mask); + tmp.vect_u8[2] = vandq_u8(vceqq_s8(a.vect_s8[2], b.vect_s8[2]), vect_mask); + tmp.vect_u8[3] = vandq_u8(vceqq_s8(a.vect_s8[3], b.vect_s8[3]), vect_mask); + uint64_t r0 = vaddv_u8(vget_low_u8(tmp.vect_u8[0])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[0])) << 8); + uint64_t r1 = vaddv_u8(vget_low_u8(tmp.vect_u8[1])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[1])) << 8); + uint64_t r2 = vaddv_u8(vget_low_u8(tmp.vect_u8[2])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[2])) << 8); + uint64_t r3 = vaddv_u8(vget_low_u8(tmp.vect_u8[3])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[3])) << 8); + __mmask64 result = r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + return result; +} + +FORCE_INLINE __mmask64 _mm512_mask_cmpeq_epi8_mask(__mmask64 k1, __m512i a, __m512i b) +{ + return (_mm512_cmpeq_epi8_mask(a, b) & k1); +} + +FORCE_INLINE __mmask64 _mm512_cmplt_epi8_mask(__m512i a, __m512i b) +{ + uint8x16_t vect_mask = vld1q_u8(g_mask_epi8); + __m512i tmp; + tmp.vect_u8[0] = vandq_u8(vcltq_s8(a.vect_s8[0], b.vect_s8[0]), vect_mask); + tmp.vect_u8[1] = vandq_u8(vcltq_s8(a.vect_s8[1], b.vect_s8[1]), vect_mask); + tmp.vect_u8[2] = vandq_u8(vcltq_s8(a.vect_s8[2], b.vect_s8[2]), vect_mask); + tmp.vect_u8[3] = vandq_u8(vcltq_s8(a.vect_s8[3], b.vect_s8[3]), vect_mask); + uint64_t r0 = vaddv_u8(vget_low_u8(tmp.vect_u8[0])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[0])) << 8); + uint64_t r1 = vaddv_u8(vget_low_u8(tmp.vect_u8[1])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[1])) << 8); + uint64_t r2 = vaddv_u8(vget_low_u8(tmp.vect_u8[2])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[2])) << 8); + uint64_t r3 = vaddv_u8(vget_low_u8(tmp.vect_u8[3])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[3])) << 8); + __mmask64 result = r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + return result; +} + +FORCE_INLINE __mmask64 _mm512_cmple_epi8_mask(__m512i a, __m512i b) +{ + uint8x16_t vect_mask = vld1q_u8(g_mask_epi8); + __m512i tmp; + tmp.vect_u8[0] = vandq_u8(vcleq_s8(a.vect_s8[0], b.vect_s8[0]), vect_mask); + tmp.vect_u8[1] = vandq_u8(vcleq_s8(a.vect_s8[1], b.vect_s8[1]), vect_mask); + tmp.vect_u8[2] = vandq_u8(vcleq_s8(a.vect_s8[2], b.vect_s8[2]), vect_mask); + tmp.vect_u8[3] = vandq_u8(vcleq_s8(a.vect_s8[3], b.vect_s8[3]), vect_mask); + uint64_t r0 = vaddv_u8(vget_low_u8(tmp.vect_u8[0])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[0])) << 8); + uint64_t r1 = vaddv_u8(vget_low_u8(tmp.vect_u8[1])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[1])) << 8); + uint64_t r2 = vaddv_u8(vget_low_u8(tmp.vect_u8[2])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[2])) << 8); + uint64_t r3 = vaddv_u8(vget_low_u8(tmp.vect_u8[3])) | (vaddv_u8(vget_high_u8(tmp.vect_u8[3])) << 8); + __mmask64 result = r0 | (r1 << 16) | (r2 << 32) | (r3 << 48); + return result; +} + +FORCE_INLINE __mmask64 _mm512_cmpneq_epi8_mask(__m512i a, __m512i b) +{ + return ~_mm512_cmpeq_epi8_mask(a, b); +} + +FORCE_INLINE __mmask64 _mm512_cmpnlt_epi8_mask(__m512i a, __m512i b) +{ + return ~_mm512_cmplt_epi8_mask(a, b); +} + +FORCE_INLINE __mmask64 _mm512_cmpnle_epi8_mask(__m512i a, __m512i b) +{ + return ~_mm512_cmple_epi8_mask(a, b); +} + +typedef __mmask64 (*TYPE_FUNC_EPI8)(__m512i a, __m512i b); +typedef struct { + _MM_CMPINT_ENUM cmpintEnum; + TYPE_FUNC_EPI8 cmpFun; +} FuncListEpi8; + +static FuncListEpi8 g_FunListEpi8[] = {{_MM_CMPINT_EQ, _mm512_cmpeq_epi8_mask}, + 
{_MM_CMPINT_LT, _mm512_cmplt_epi8_mask}, + {_MM_CMPINT_LE, _mm512_cmple_epi8_mask}, + {_MM_CMPINT_FALSE, NULL}, + {_MM_CMPINT_NE, _mm512_cmpneq_epi8_mask}, + {_MM_CMPINT_NLT, _mm512_cmpnlt_epi8_mask}, + {_MM_CMPINT_NLE, _mm512_cmpnle_epi8_mask}, + {_MM_CMPINT_TRUE, NULL}}; + +FORCE_INLINE __mmask64 _mm512_cmp_epi8_mask(__m512i a, __m512i b, const int imm8) +{ + if (unlikely(imm8 == _MM_CMPINT_FALSE)) { + return 0; + } + if (unlikely(imm8 == _MM_CMPINT_TRUE)) { + return 0xffffffffffffffff; + } + return g_FunListEpi8[imm8].cmpFun(a, b); +} + +FORCE_INLINE __m512i _mm512_and_si512(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = vandq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m512i.vect_s32[1] = vandq_s32(a.vect_s32[1], b.vect_s32[1]); + res_m512i.vect_s32[2] = vandq_s32(a.vect_s32[2], b.vect_s32[2]); + res_m512i.vect_s32[3] = vandq_s32(a.vect_s32[3], b.vect_s32[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_or_si512(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = vorrq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m512i.vect_s32[1] = vorrq_s32(a.vect_s32[1], b.vect_s32[1]); + res_m512i.vect_s32[2] = vorrq_s32(a.vect_s32[2], b.vect_s32[2]); + res_m512i.vect_s32[3] = vorrq_s32(a.vect_s32[3], b.vect_s32[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_andnot_si512(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = vbicq_s32(b.vect_s32[0], a.vect_s32[0]); + res_m512i.vect_s32[1] = vbicq_s32(b.vect_s32[1], a.vect_s32[1]); + res_m512i.vect_s32[2] = vbicq_s32(b.vect_s32[2], a.vect_s32[2]); + res_m512i.vect_s32[3] = vbicq_s32(b.vect_s32[3], a.vect_s32[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_xor_si512(__m512i a, __m512i b) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = veorq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m512i.vect_s32[1] = veorq_s32(a.vect_s32[1], b.vect_s32[1]); + res_m512i.vect_s32[2] = veorq_s32(a.vect_s32[2], b.vect_s32[2]); + res_m512i.vect_s32[3] = veorq_s32(a.vect_s32[3], b.vect_s32[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_and_epi32 (__m512i a, __m512i b) +{ + a.vect_s32[0] = vandq_s32(a.vect_s32[0], b.vect_s32[0]); + a.vect_s32[1] = vandq_s32(a.vect_s32[1], b.vect_s32[1]); + a.vect_s32[2] = vandq_s32(a.vect_s32[2], b.vect_s32[2]); + a.vect_s32[3] = vandq_s32(a.vect_s32[3], b.vect_s32[3]); + return a; +} + +FORCE_INLINE __m512i _mm512_and_epi64 (__m512i a, __m512i b) +{ + a.vect_s64[0] = vandq_s64(a.vect_s64[0], b.vect_s64[0]); + a.vect_s64[1] = vandq_s64(a.vect_s64[1], b.vect_s64[1]); + a.vect_s64[2] = vandq_s64(a.vect_s64[2], b.vect_s64[2]); + a.vect_s64[3] = vandq_s64(a.vect_s64[3], b.vect_s64[3]); + return a; +} + +FORCE_INLINE __m512i _mm512_or_epi32 (__m512i a, __m512i b) +{ + a.vect_s32[0] = vorrq_s32(a.vect_s32[0], b.vect_s32[0]); + a.vect_s32[1] = vorrq_s32(a.vect_s32[1], b.vect_s32[1]); + a.vect_s32[2] = vorrq_s32(a.vect_s32[2], b.vect_s32[2]); + a.vect_s32[3] = vorrq_s32(a.vect_s32[3], b.vect_s32[3]); + return a; +} + +FORCE_INLINE __m512i _mm512_or_epi64 (__m512i a, __m512i b) +{ + a.vect_s64[0] = vorrq_s64(a.vect_s64[0], b.vect_s64[0]); + a.vect_s64[1] = vorrq_s64(a.vect_s64[1], b.vect_s64[1]); + a.vect_s64[2] = vorrq_s64(a.vect_s64[2], b.vect_s64[2]); + a.vect_s64[3] = vorrq_s64(a.vect_s64[3], b.vect_s64[3]); + return a; +} + +FORCE_INLINE __m512 _mm512_xor_ps (__m512 a, __m512 b) +{ + __asm__ __volatile( + "eor %0.16b, %0.16b, %2.16b \n\t" + "eor %1.16b, %1.16b, %3.16b \n\t" + :"+w"(a.vect_f32[0]), "+w"(a.vect_f32[1]) + :"w"(b.vect_f32[0]), 
"w"(b.vect_f32[1]) + ); + __asm__ __volatile( + "eor %0.16b, %0.16b, %2.16b \n\t" + "eor %1.16b, %1.16b, %3.16b \n\t" + :"+w"(a.vect_f32[2]), "+w"(a.vect_f32[3]) + :"w"(b.vect_f32[2]), "w"(b.vect_f32[3]) + ); + return a; +} + +FORCE_INLINE __m512d _mm512_xor_pd (__m512d a, __m512d b) +{ + __asm__ __volatile( + "eor %0.16b, %0.16b, %2.16b \n\t" + "eor %1.16b, %1.16b, %3.16b \n\t" + :"+w"(a.vect_f64[0]), "+w"(a.vect_f64[1]) + :"w"(b.vect_f64[0]), "w"(b.vect_f64[1]) + ); + __asm__ __volatile( + "eor %0.16b, %0.16b, %2.16b \n\t" + "eor %1.16b, %1.16b, %3.16b \n\t" + :"+w"(a.vect_f64[2]), "+w"(a.vect_f64[3]) + :"w"(b.vect_f64[2]), "w"(b.vect_f64[3]) + ); + return a; +} + +FORCE_INLINE __m512i _mm512_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, + int e6, int e5, int e4, int e3, int e2, int e1, int e0) +{ + __m512i res_m512i; + SET32x4(res_m512i.vect_s32[0], e0, e1, e2, e3); + SET32x4(res_m512i.vect_s32[1], e4, e5, e6, e7); + SET32x4(res_m512i.vect_s32[2], e8, e9, e10, e11); + SET32x4(res_m512i.vect_s32[3], e12, e13, e14, e15); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_set_epi64( + __int64 e7, __int64 e6, __int64 e5, __int64 e4, __int64 e3, __int64 e2, __int64 e1, __int64 e0) +{ + __m512i res_m512i; + SET64x2(res_m512i.vect_s64[0], e0, e1); + SET64x2(res_m512i.vect_s64[1], e2, e3); + SET64x2(res_m512i.vect_s64[2], e4, e5); + SET64x2(res_m512i.vect_s64[3], e6, e7); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_set1_epi32(int a) +{ + __m512i res_m512i; + res_m512i.vect_s32[0] = vdupq_n_s32(a); + res_m512i.vect_s32[1] = vdupq_n_s32(a); + res_m512i.vect_s32[2] = vdupq_n_s32(a); + res_m512i.vect_s32[3] = vdupq_n_s32(a); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_set1_epi64(__int64 a) +{ + __m512i res_m512i; + res_m512i.vect_s64[0] = vdupq_n_s64(a); + res_m512i.vect_s64[1] = vdupq_n_s64(a); + res_m512i.vect_s64[2] = vdupq_n_s64(a); + res_m512i.vect_s64[3] = vdupq_n_s64(a); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_set1_epi8(char a) +{ + __m512i res_m512i; + res_m512i.vect_s8[0] = vdupq_n_s8(a); + res_m512i.vect_s8[1] = vdupq_n_s8(a); + res_m512i.vect_s8[2] = vdupq_n_s8(a); + res_m512i.vect_s8[3] = vdupq_n_s8(a); + return res_m512i; +} + +FORCE_INLINE __m512 _mm512_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, + float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) +{ + __m512 res_m512; + SET32x4(res_m512.vect_f32[0], e0, e1, e2, e3); + SET32x4(res_m512.vect_f32[1], e4, e5, e6, e7); + SET32x4(res_m512.vect_f32[2], e8, e9, e10, e11); + SET32x4(res_m512.vect_f32[3], e12, e13, e14, e15); + return res_m512; +} + +FORCE_INLINE __m512d _mm512_set_pd( + double e7, double e6, double e5, double e4, double e3, double e2, double e1, double e0) +{ + __m512d res_m512d; + SET64x2(res_m512d.vect_f64[0], e0, e1); + SET64x2(res_m512d.vect_f64[1], e2, e3); + SET64x2(res_m512d.vect_f64[2], e4, e5); + SET64x2(res_m512d.vect_f64[3], e6, e7); + return res_m512d; +} + +FORCE_INLINE __m512 _mm512_set1_ps(float a) +{ + __m512 res_m512; + res_m512.vect_f32[0] = vdupq_n_f32(a); + res_m512.vect_f32[1] = vdupq_n_f32(a); + res_m512.vect_f32[2] = vdupq_n_f32(a); + res_m512.vect_f32[3] = vdupq_n_f32(a); + return res_m512; +} + +FORCE_INLINE __m512d _mm512_set1_pd(double a) +{ + __m512d res_m512d; + res_m512d.vect_f64[0] = vdupq_n_f64(a); + res_m512d.vect_f64[1] = vdupq_n_f64(a); + res_m512d.vect_f64[2] = vdupq_n_f64(a); + res_m512d.vect_f64[3] = vdupq_n_f64(a); + return 
res_m512d; +} + +FORCE_INLINE __m512 _mm512_setzero_ps() +{ + __m512 res_m512; + res_m512.vect_f32[0] = vdupq_n_f32(0.0); + res_m512.vect_f32[1] = vdupq_n_f32(0.0); + res_m512.vect_f32[2] = vdupq_n_f32(0.0); + res_m512.vect_f32[3] = vdupq_n_f32(0.0); + return res_m512; +} + +FORCE_INLINE __m512d _mm512_setzero_pd() +{ + __m512d res_m512d; + res_m512d.vect_f64[0] = vdupq_n_f64(0.0); + res_m512d.vect_f64[1] = vdupq_n_f64(0.0); + res_m512d.vect_f64[2] = vdupq_n_f64(0.0); + res_m512d.vect_f64[3] = vdupq_n_f64(0.0); + return res_m512d; +} + +FORCE_INLINE __m512i _mm512_movm_epi8(__mmask64 k) +{ + uint8x8_t mk = vcreate_u8(k); + uint8x16_t mask_and = vld1q_u8(g_mask_epi8); + + __m512i res_m512i; + res_m512i.vect_u8[0] = vcombine_u8(vdup_lane_u8(mk, 0), vdup_lane_u8(mk, 1)); + res_m512i.vect_u8[1] = vcombine_u8(vdup_lane_u8(mk, 2), vdup_lane_u8(mk, 3)); + res_m512i.vect_u8[2] = vcombine_u8(vdup_lane_u8(mk, 4), vdup_lane_u8(mk, 5)); + res_m512i.vect_u8[3] = vcombine_u8(vdup_lane_u8(mk, 6), vdup_lane_u8(mk, 7)); + res_m512i.vect_u8[0] = vtstq_u8(mask_and, res_m512i.vect_u8[0]); + res_m512i.vect_u8[1] = vtstq_u8(mask_and, res_m512i.vect_u8[1]); + res_m512i.vect_u8[2] = vtstq_u8(mask_and, res_m512i.vect_u8[2]); + res_m512i.vect_u8[3] = vtstq_u8(mask_and, res_m512i.vect_u8[3]); + return res_m512i; +} + +FORCE_INLINE __m128i _mm512_extracti32x4_epi32(__m512i a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 3); + __m128i res_m128i; + res_m128i.vect_s32 = a.vect_s32[imm8]; + return res_m128i; +} + +FORCE_INLINE __m256 _mm512_extractf32x8_ps (__m512 a, int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + __m256 res_m256; + int id = imm8 << 1; + res_m256.vect_f32[0] = a.vect_f32[id]; + res_m256.vect_f32[1] = a.vect_f32[id | 1]; + return res_m256; +} + +FORCE_INLINE __m256d _mm512_extractf64x4_pd (__m512d a, int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + __m256d res_m256d; + int id = imm8 << 1; + res_m256d.vect_f64[0] = a.vect_f64[id]; + res_m256d.vect_f64[1] = a.vect_f64[id | 1]; + return res_m256d; +} + +FORCE_INLINE void _mm512_store_si512(void* mem_addr, __m512i a) +{ + vst1q_s64((int64_t*)mem_addr, a.vect_s64[0]); + vst1q_s64((int64_t*)mem_addr + 2, a.vect_s64[1]); + vst1q_s64((int64_t*)mem_addr + 4, a.vect_s64[2]); + vst1q_s64((int64_t*)mem_addr + 6, a.vect_s64[3]); +} + +FORCE_INLINE __m512i _mm512_load_si512(void const* mem_addr) +{ + __m512i ret; + ret.vect_s32[0] = vld1q_s32((int32_t const*)mem_addr); + ret.vect_s32[1] = vld1q_s32(((int32_t const*)mem_addr) + 4); + ret.vect_s32[2] = vld1q_s32(((int32_t const*)mem_addr) + 8); + ret.vect_s32[3] = vld1q_s32(((int32_t const*)mem_addr) + 12); + return ret; +} + +FORCE_INLINE __m512i _mm512_loadu_si512(void const* mem_addr) +{ + __m512i ret; + ret.vect_s32[0] = vld1q_s32((int32_t const*)mem_addr); + ret.vect_s32[1] = vld1q_s32(((int32_t const*)mem_addr) + 4); + ret.vect_s32[2] = vld1q_s32(((int32_t const*)mem_addr) + 8); + ret.vect_s32[3] = vld1q_s32(((int32_t const*)mem_addr) + 12); + return ret; +} + +FORCE_INLINE __m512i _mm512_mask_loadu_epi8(__m512i src, __mmask64 k, void const* mem_addr) +{ + __m512i ret; + int8_t const* data_addr = (int8_t const*)mem_addr; + uint8x16_t mask = vld1q_u8(g_mask_epi8); + uint8x16_t k_vec[4]; + k_vec[0] = vcombine_u8(vdup_n_u8(k & 0xff), vdup_n_u8((k >> 8) & 0xff)); + k_vec[1] = vcombine_u8(vdup_n_u8((k >> 16) & 0xff), vdup_n_u8((k >> 24) & 0xff)); + k_vec[2] = vcombine_u8(vdup_n_u8((k >> 32) & 0xff), vdup_n_u8((k >> 40) & 0xff)); + k_vec[3] = vcombine_u8(vdup_n_u8((k >> 48) & 0xff), vdup_n_u8((k >> 56) & 0xff)); + 
ret.vect_s8[0] = vbslq_s8(vtstq_u8(k_vec[0], mask), vld1q_s8(data_addr), src.vect_s8[0]); + ret.vect_s8[1] = vbslq_s8(vtstq_u8(k_vec[1], mask), vld1q_s8(data_addr + 16), src.vect_s8[1]); + ret.vect_s8[2] = vbslq_s8(vtstq_u8(k_vec[2], mask), vld1q_s8(data_addr + 32), src.vect_s8[2]); + ret.vect_s8[3] = vbslq_s8(vtstq_u8(k_vec[3], mask), vld1q_s8(data_addr + 48), src.vect_s8[3]); + return ret; +} + +FORCE_INLINE __m512i _mm512_maskz_loadu_epi8(__mmask64 k, void const* mem_addr) +{ + __m512i ret; + uint8_t const* data_addr = (uint8_t const*)mem_addr; + uint8x16_t mask = vld1q_u8(g_mask_epi8); + uint8x16_t k_vec[4]; + k_vec[0] = vcombine_u8(vdup_n_u8(k & 0xff), vdup_n_u8((k >> 8) & 0xff)); + k_vec[1] = vcombine_u8(vdup_n_u8((k >> 16) & 0xff), vdup_n_u8((k >> 24) & 0xff)); + k_vec[2] = vcombine_u8(vdup_n_u8((k >> 32) & 0xff), vdup_n_u8((k >> 40) & 0xff)); + k_vec[3] = vcombine_u8(vdup_n_u8((k >> 48) & 0xff), vdup_n_u8((k >> 56) & 0xff)); + ret.vect_u8[0] = vandq_u8(vtstq_u8(k_vec[0], mask), vld1q_u8(data_addr)); + ret.vect_u8[1] = vandq_u8(vtstq_u8(k_vec[1], mask), vld1q_u8(data_addr + 16)); + ret.vect_u8[2] = vandq_u8(vtstq_u8(k_vec[2], mask), vld1q_u8(data_addr + 32)); + ret.vect_u8[3] = vandq_u8(vtstq_u8(k_vec[3], mask), vld1q_u8(data_addr + 48)); + return ret; +} + +FORCE_INLINE __m512i _mm512_abs_epi8(__m512i a) +{ + __m512i ret; + ret.vect_s8[0] = vabsq_s8(a.vect_s8[0]); + ret.vect_s8[1] = vabsq_s8(a.vect_s8[1]); + ret.vect_s8[2] = vabsq_s8(a.vect_s8[2]); + ret.vect_s8[3] = vabsq_s8(a.vect_s8[3]); + return ret; +} + +FORCE_INLINE __m512i _mm512_broadcast_i32x4(__m128i a) +{ + __m512i ret; + ret.vect_s32[0] = a.vect_s32; + ret.vect_s32[1] = a.vect_s32; + ret.vect_s32[2] = a.vect_s32; + ret.vect_s32[3] = a.vect_s32; + return ret; +} + +FORCE_INLINE __m512i _mm512_broadcast_i64x4(__m256i a) +{ + __m512i ret; + ret.vect_s64[0] = a.vect_s64[0]; + ret.vect_s64[1] = a.vect_s64[1]; + ret.vect_s64[2] = a.vect_s64[0]; + ret.vect_s64[3] = a.vect_s64[1]; + return ret; +} + +FORCE_INLINE __m512i _mm512_mask_broadcast_i64x4(__m512i src, __mmask8 k, __m256i a) +{ + __m512i ret; + uint64x2_t vect_mask = vld1q_u64(g_mask_epi64); + uint64x2_t tmp[4]; + tmp[0] = vtstq_u64(vdupq_n_u64(k & 0x03), vect_mask); + tmp[1] = vtstq_u64(vdupq_n_u64((k & 0x0c) >> 2), vect_mask); + tmp[2] = vtstq_u64(vdupq_n_u64((k & 0x30) >> 4), vect_mask); + tmp[3] = vtstq_u64(vdupq_n_u64((k & 0xc0) >> 6), vect_mask); + ret.vect_s64[0] = vbslq_s64(tmp[0], a.vect_s64[0], src.vect_s64[0]); + ret.vect_s64[1] = vbslq_s64(tmp[1], a.vect_s64[1], src.vect_s64[1]); + ret.vect_s64[2] = vbslq_s64(tmp[2], a.vect_s64[0], src.vect_s64[2]); + ret.vect_s64[3] = vbslq_s64(tmp[3], a.vect_s64[1], src.vect_s64[3]); + return ret; +} + +FORCE_INLINE __m512i _mm512_shuffle_epi8(__m512i a, __m512i b) +{ + __m512i res_m512i; + uint8x16_t mask_and = vdupq_n_u8(0x8f); + res_m512i.vect_u8[0] = vqtbl1q_u8(a.vect_u8[0], vandq_u8(b.vect_u8[0], mask_and)); + res_m512i.vect_u8[1] = vqtbl1q_u8(a.vect_u8[1], vandq_u8(b.vect_u8[1], mask_and)); + res_m512i.vect_u8[2] = vqtbl1q_u8(a.vect_u8[2], vandq_u8(b.vect_u8[2], mask_and)); + res_m512i.vect_u8[3] = vqtbl1q_u8(a.vect_u8[3], vandq_u8(b.vect_u8[3], mask_and)); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_maskz_shuffle_epi8(__mmask64 k, __m512i a, __m512i b) +{ + uint8x8_t mk = vcreate_u8(k); + uint8x16_t mask_and = vld1q_u8(g_mask_epi8); + + __m512i tmp, res_m512i; + tmp.vect_u8[0] = vtstq_u8(mask_and, vcombine_u8(vdup_lane_u8(mk, 0), vdup_lane_u8(mk, 1))); + tmp.vect_u8[1] = vtstq_u8(mask_and, 
vcombine_u8(vdup_lane_u8(mk, 2), vdup_lane_u8(mk, 3))); + tmp.vect_u8[2] = vtstq_u8(mask_and, vcombine_u8(vdup_lane_u8(mk, 4), vdup_lane_u8(mk, 5))); + tmp.vect_u8[3] = vtstq_u8(mask_and, vcombine_u8(vdup_lane_u8(mk, 6), vdup_lane_u8(mk, 7))); + mask_and = vdupq_n_u8(0x8f); + res_m512i.vect_u8[0] = vqtbl1q_u8(a.vect_u8[0], vandq_u8(b.vect_u8[0], mask_and)); + res_m512i.vect_u8[1] = vqtbl1q_u8(a.vect_u8[1], vandq_u8(b.vect_u8[1], mask_and)); + res_m512i.vect_u8[2] = vqtbl1q_u8(a.vect_u8[2], vandq_u8(b.vect_u8[2], mask_and)); + res_m512i.vect_u8[3] = vqtbl1q_u8(a.vect_u8[3], vandq_u8(b.vect_u8[3], mask_and)); + res_m512i.vect_u8[0] = vminq_u8(res_m512i.vect_u8[0], tmp.vect_u8[0]); + res_m512i.vect_u8[1] = vminq_u8(res_m512i.vect_u8[1], tmp.vect_u8[1]); + res_m512i.vect_u8[2] = vminq_u8(res_m512i.vect_u8[2], tmp.vect_u8[2]); + res_m512i.vect_u8[3] = vminq_u8(res_m512i.vect_u8[3], tmp.vect_u8[3]); + return res_m512i; +} + +FORCE_INLINE __m512i _mm512_multishift_epi64_epi8(__m512i a, __m512i b) +{ + __m512i res; + res.vect_i256[0] = _mm256_multishift_epi64_epi8(a.vect_i256[0], b.vect_i256[0]); + res.vect_i256[1] = _mm256_multishift_epi64_epi8(a.vect_i256[1], b.vect_i256[1]); + return res; +} + +FORCE_INLINE __m512 _mm512_mask_blend_ps(__mmask16 k, __m512 a, __m512 b) +{ + __m512 result_m512; + uint32x4_t vect_mask = vld1q_u32(g_mask_epi32); + uint32x4_t vect_imm = vdupq_n_u32(k); + uint32x4_t flag[4]; + flag[0] = vtstq_u32(vect_imm, vect_mask); + flag[1] = vtstq_u32(vshrq_n_u32(vect_imm, 4), vect_mask); + flag[2] = vtstq_u32(vshrq_n_u32(vect_imm, 8), vect_mask); + flag[3] = vtstq_u32(vshrq_n_u32(vect_imm, 12), vect_mask); + result_m512.vect_f32[0] = vbslq_f32(flag[0], b.vect_f32[0], a.vect_f32[0]); + result_m512.vect_f32[1] = vbslq_f32(flag[1], b.vect_f32[1], a.vect_f32[1]); + result_m512.vect_f32[2] = vbslq_f32(flag[2], b.vect_f32[2], a.vect_f32[2]); + result_m512.vect_f32[3] = vbslq_f32(flag[3], b.vect_f32[3], a.vect_f32[3]); + return result_m512; +} + +FORCE_INLINE __m512d _mm512_mask_blend_pd(__mmask8 k, __m512d a, __m512d b) +{ + __m512d result_m512d; + uint64x2_t vect_mask = vld1q_u64(g_mask_epi64); + uint64x2_t vect_imm = vdupq_n_u64(k); + uint64x2_t flag[4]; + flag[0] = vtstq_u64(vect_imm, vect_mask); + flag[1] = vtstq_u64(vshrq_n_u64(vect_imm, 2), vect_mask); + flag[2] = vtstq_u64(vshrq_n_u64(vect_imm, 4), vect_mask); + flag[3] = vtstq_u64(vshrq_n_u64(vect_imm, 6), vect_mask); + result_m512d.vect_f64[0] = vbslq_f64(flag[0], b.vect_f64[0], a.vect_f64[0]); + result_m512d.vect_f64[1] = vbslq_f64(flag[1], b.vect_f64[1], a.vect_f64[1]); + result_m512d.vect_f64[2] = vbslq_f64(flag[2], b.vect_f64[2], a.vect_f64[2]); + result_m512d.vect_f64[3] = vbslq_f64(flag[3], b.vect_f64[3], a.vect_f64[3]); + return result_m512d; +} + +FORCE_INLINE __m512d _mm512_castpd128_pd512 (__m128d a) +{ + __m512d ret; + ret.vect_f64[0] = a; + return ret; +} + +FORCE_INLINE __m128d _mm512_castpd512_pd128 (__m512d a) +{ + return a.vect_f64[0]; +} + +FORCE_INLINE __m512 _mm512_castps128_ps512 (__m128 a) +{ + __m512 ret; + ret.vect_f32[0] = a; + return ret; +} + +FORCE_INLINE __m128 _mm512_castps512_ps128 (__m512 a) +{ + return a.vect_f32[0]; +} + +FORCE_INLINE __m512 _mm512_cvtepi32_ps (__m512i a) +{ + __m512 ret; + ret.vect_f32[0] = vcvtq_f32_s32(a.vect_s32[0]); + ret.vect_f32[1] = vcvtq_f32_s32(a.vect_s32[1]); + ret.vect_f32[2] = vcvtq_f32_s32(a.vect_s32[2]); + ret.vect_f32[3] = vcvtq_f32_s32(a.vect_s32[3]); + return ret; +} + +FORCE_INLINE __m512d _mm512_cvtepi32_pd (__m256i a) +{ + __m512d res; + __asm__ 
__volatile__ ( /* widen int32 to int64 before converting so every lane maps to f64 exactly; the previous scvtf-to-f32 round trip lost precision above 2^24. "=&w" earlyclobbers keep the outputs from aliasing inputs that later instructions still read */ + "sshll %[r0].2d, %[a0].2s, #0 \n\t" + "sshll2 %[r1].2d, %[a0].4s, #0 \n\t" + "sshll %[r2].2d, %[a1].2s, #0 \n\t" + "sshll2 %[r3].2d, %[a1].4s, #0 \n\t" + "scvtf %[r0].2d, %[r0].2d \n\t" + "scvtf %[r1].2d, %[r1].2d \n\t" + "scvtf %[r2].2d, %[r2].2d \n\t" + "scvtf %[r3].2d, %[r3].2d \n\t" + :[r0]"=&w"(res.vect_f64[0]), [r1]"=&w"(res.vect_f64[1]), [r2]"=&w"(res.vect_f64[2]), [r3]"=&w"(res.vect_f64[3]) + :[a0]"w"(a.vect_s32[0]), [a1]"w"(a.vect_s32[1]) + ); + return res; +} + +FORCE_INLINE __m512 _mm512_insertf32x8 (__m512 a, __m256 b, int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m512 res; + uint32x4_t vmask = vceqq_s32(vdupq_n_s32(imm8), vdupq_n_s32(0)); + res.vect_f32[0] = vbslq_f32(vmask, b.vect_f32[0], a.vect_f32[0]); + res.vect_f32[1] = vbslq_f32(vmask, b.vect_f32[1], a.vect_f32[1]); + res.vect_f32[2] = vbslq_f32(vmask, a.vect_f32[2], b.vect_f32[0]); + res.vect_f32[3] = vbslq_f32(vmask, a.vect_f32[3], b.vect_f32[1]); + return res; +} + +FORCE_INLINE __m512d _mm512_insertf64x4 (__m512d a, __m256d b, int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m512d res; + uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0)); + res.vect_f64[0] = vbslq_f64(vmask, b.vect_f64[0], a.vect_f64[0]); + res.vect_f64[1] = vbslq_f64(vmask, b.vect_f64[1], a.vect_f64[1]); + res.vect_f64[2] = vbslq_f64(vmask, a.vect_f64[2], b.vect_f64[0]); + res.vect_f64[3] = vbslq_f64(vmask, a.vect_f64[3], b.vect_f64[1]); + return res; +} + +FORCE_INLINE __m512i _mm512_inserti32x8 (__m512i a, __m256i b, int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m512i res; + uint32x4_t vmask = vceqq_s32(vdupq_n_s32(imm8), vdupq_n_s32(0)); + res.vect_s32[0] = vbslq_s32(vmask, b.vect_s32[0], a.vect_s32[0]); + res.vect_s32[1] = vbslq_s32(vmask, b.vect_s32[1], a.vect_s32[1]); + res.vect_s32[2] = vbslq_s32(vmask, a.vect_s32[2], b.vect_s32[0]); + res.vect_s32[3] = vbslq_s32(vmask, a.vect_s32[3], b.vect_s32[1]); + return res; +} + +FORCE_INLINE __m512i _mm512_inserti64x4 (__m512i a, __m256i b, int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m512i res; + uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0)); + res.vect_s64[0] = vbslq_s64(vmask, b.vect_s64[0], a.vect_s64[0]); + res.vect_s64[1] = vbslq_s64(vmask, b.vect_s64[1], a.vect_s64[1]); + res.vect_s64[2] = vbslq_s64(vmask, a.vect_s64[2], b.vect_s64[0]); + res.vect_s64[3] = vbslq_s64(vmask, a.vect_s64[3], b.vect_s64[1]); + return res; +} diff --git a/avxintrin.h b/avxintrin.h new file mode 100644 index 0000000..f57ae47 --- /dev/null +++ b/avxintrin.h @@ -0,0 +1,2158 @@ +/* + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + + */ + +#ifndef AVX2NEON_H +#error Never use <avxintrin.h> directly; include "avx2neon.h" instead.
+#endif + + +#include <arm_neon.h> + +#include <math.h> +#ifdef __cplusplus +using namespace std; +#endif + +#include "typedefs.h" + +typedef union { + int8x16_t vect_s8[2]; + int16x8_t vect_s16[2]; + int32x4_t vect_s32[2]; + int64x2_t vect_s64[2]; + uint8x16_t vect_u8[2]; + uint16x8_t vect_u16[2]; + uint32x4_t vect_u32[2]; + uint64x2_t vect_u64[2]; + __m128i vect_i128[2]; +} __m256i __attribute__((aligned(32))); + +typedef struct { + float32x4_t vect_f32[2]; +} __m256; + +typedef struct { + float64x2_t vect_f64[2]; +} __m256d; + +#define _CMP_EQ_OQ 0x00 +#define _CMP_LT_OS 0x01 +#define _CMP_LE_OS 0x02 +#define _CMP_UNORD_Q 0x03 +#define _CMP_NEQ_UQ 0x04 +#define _CMP_NLT_US 0x05 +#define _CMP_NLE_US 0x06 +#define _CMP_ORD_Q 0x07 +#define _CMP_EQ_UQ 0x08 +#define _CMP_NGE_US 0x09 +#define _CMP_NGT_US 0x0a +#define _CMP_FALSE_OQ 0x0b +#define _CMP_NEQ_OQ 0x0c +#define _CMP_GE_OS 0x0d +#define _CMP_GT_OS 0x0e +#define _CMP_TRUE_UQ 0x0f +#define _CMP_EQ_OS 0x10 +#define _CMP_LT_OQ 0x11 +#define _CMP_LE_OQ 0x12 +#define _CMP_UNORD_S 0x13 +#define _CMP_NEQ_US 0x14 +#define _CMP_NLT_UQ 0x15 +#define _CMP_NLE_UQ 0x16 +#define _CMP_ORD_S 0x17 +#define _CMP_EQ_US 0x18 +#define _CMP_NGE_UQ 0x19 +#define _CMP_NGT_UQ 0x1a +#define _CMP_FALSE_OS 0x1b +#define _CMP_NEQ_OS 0x1c +#define _CMP_GE_OQ 0x1d +#define _CMP_GT_OQ 0x1e +#define _CMP_TRUE_US 0x1f + +FORCE_INLINE void _mm256_convert_to_int32(int32_t* ptr_a, __m256i a) +{ + ptr_a[0] = vgetq_lane_s32(a.vect_s32[0], 0); + ptr_a[1] = vgetq_lane_s32(a.vect_s32[0], 1); + ptr_a[2] = vgetq_lane_s32(a.vect_s32[0], 2); + ptr_a[3] = vgetq_lane_s32(a.vect_s32[0], 3); + ptr_a[4] = vgetq_lane_s32(a.vect_s32[1], 0); + ptr_a[5] = vgetq_lane_s32(a.vect_s32[1], 1); + ptr_a[6] = vgetq_lane_s32(a.vect_s32[1], 2); + ptr_a[7] = vgetq_lane_s32(a.vect_s32[1], 3); +} + +FORCE_INLINE void _mm256_convert_to_int64(int64_t* ptr_a, __m256i a) +{ + ptr_a[0] = vgetq_lane_s64(a.vect_s64[0], 0); + ptr_a[1] = vgetq_lane_s64(a.vect_s64[0], 1); + ptr_a[2] = vgetq_lane_s64(a.vect_s64[1], 0); + ptr_a[3] = vgetq_lane_s64(a.vect_s64[1], 1); +} + +FORCE_INLINE void _mm256_convert_to_uint32(uint32_t* ptr_a, __m256i a) +{ + ptr_a[0] = vgetq_lane_u32(a.vect_u32[0], 0); + ptr_a[1] = vgetq_lane_u32(a.vect_u32[0], 1); + ptr_a[2] = vgetq_lane_u32(a.vect_u32[0], 2); + ptr_a[3] = vgetq_lane_u32(a.vect_u32[0], 3); + ptr_a[4] = vgetq_lane_u32(a.vect_u32[1], 0); + ptr_a[5] = vgetq_lane_u32(a.vect_u32[1], 1); + ptr_a[6] = vgetq_lane_u32(a.vect_u32[1], 2); + ptr_a[7] = vgetq_lane_u32(a.vect_u32[1], 3); +} + +FORCE_INLINE void _mm256_convert_to_uint64(uint64_t* ptr_a, __m256i a) +{ + ptr_a[0] = vgetq_lane_u64(a.vect_u64[0], 0); + ptr_a[1] = vgetq_lane_u64(a.vect_u64[0], 1); + ptr_a[2] = vgetq_lane_u64(a.vect_u64[1], 0); + ptr_a[3] = vgetq_lane_u64(a.vect_u64[1], 1); +} + +FORCE_INLINE __m256i _mm256_div_epi8(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_i128[0] = _mm_div_epi8(a.vect_i128[0], b.vect_i128[0]); + res_m256i.vect_i128[1] = _mm_div_epi8(a.vect_i128[1], b.vect_i128[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_div_epi16(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_i128[0] = _mm_div_epi16(a.vect_i128[0], b.vect_i128[0]); + res_m256i.vect_i128[1] = _mm_div_epi16(a.vect_i128[1], b.vect_i128[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_div_epi32(__m256i a, __m256i b) +{ + __m256i res; + int32_t ptr_a[8], ptr_b[8], ptr_r[8]; + _mm256_convert_to_int32(ptr_a, a); + _mm256_convert_to_int32(ptr_b, b); + ptr_r[0] = ptr_a[0] / ptr_b[0]; + ptr_r[1] = ptr_a[1] /
ptr_b[1]; + ptr_r[2] = ptr_a[2] / ptr_b[2]; + ptr_r[3] = ptr_a[3] / ptr_b[3]; + ptr_r[4] = ptr_a[4] / ptr_b[4]; + ptr_r[5] = ptr_a[5] / ptr_b[5]; + ptr_r[6] = ptr_a[6] / ptr_b[6]; + ptr_r[7] = ptr_a[7] / ptr_b[7]; + res.vect_s32[0] = vsetq_lane_s32(ptr_r[0], res.vect_s32[0], 0); + res.vect_s32[0] = vsetq_lane_s32(ptr_r[1], res.vect_s32[0], 1); + res.vect_s32[0] = vsetq_lane_s32(ptr_r[2], res.vect_s32[0], 2); + res.vect_s32[0] = vsetq_lane_s32(ptr_r[3], res.vect_s32[0], 3); + res.vect_s32[1] = vsetq_lane_s32(ptr_r[4], res.vect_s32[1], 0); + res.vect_s32[1] = vsetq_lane_s32(ptr_r[5], res.vect_s32[1], 1); + res.vect_s32[1] = vsetq_lane_s32(ptr_r[6], res.vect_s32[1], 2); + res.vect_s32[1] = vsetq_lane_s32(ptr_r[7], res.vect_s32[1], 3); + return res; +} + +FORCE_INLINE __m256i _mm256_div_epi64(__m256i a, __m256i b) +{ + __m256i res; + int64_t ptr_a[4], ptr_b[4], ptr_r[4]; + _mm256_convert_to_int64(ptr_a, a); + _mm256_convert_to_int64(ptr_b, b); + ptr_r[0] = ptr_a[0] / ptr_b[0]; + ptr_r[1] = ptr_a[1] / ptr_b[1]; + ptr_r[2] = ptr_a[2] / ptr_b[2]; + ptr_r[3] = ptr_a[3] / ptr_b[3]; + res.vect_s64[0] = vsetq_lane_s64(ptr_r[0], res.vect_s64[0], 0); + res.vect_s64[0] = vsetq_lane_s64(ptr_r[1], res.vect_s64[0], 1); + res.vect_s64[1] = vsetq_lane_s64(ptr_r[2], res.vect_s64[1], 0); + res.vect_s64[1] = vsetq_lane_s64(ptr_r[3], res.vect_s64[1], 1); + return res; +} +FORCE_INLINE __m256i _mm256_div_epu8(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_i128[0] = _mm_div_epu8(a.vect_i128[0], b.vect_i128[0]); + res_m256i.vect_i128[1] = _mm_div_epu8(a.vect_i128[1], b.vect_i128[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_div_epu16(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_i128[0] = _mm_div_epu16(a.vect_i128[0], b.vect_i128[0]); + res_m256i.vect_i128[1] = _mm_div_epu16(a.vect_i128[1], b.vect_i128[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_div_epu32(__m256i a, __m256i b) +{ + __m256i res; + uint32_t ptr_a[8], ptr_b[8], ptr_r[8]; + _mm256_convert_to_uint32(ptr_a, a); + _mm256_convert_to_uint32(ptr_b, b); + ptr_r[0] = ptr_a[0] / ptr_b[0]; + ptr_r[1] = ptr_a[1] / ptr_b[1]; + ptr_r[2] = ptr_a[2] / ptr_b[2]; + ptr_r[3] = ptr_a[3] / ptr_b[3]; + ptr_r[4] = ptr_a[4] / ptr_b[4]; + ptr_r[5] = ptr_a[5] / ptr_b[5]; + ptr_r[6] = ptr_a[6] / ptr_b[6]; + ptr_r[7] = ptr_a[7] / ptr_b[7]; + res.vect_u32[0] = vsetq_lane_u32(ptr_r[0], res.vect_u32[0], 0); + res.vect_u32[0] = vsetq_lane_u32(ptr_r[1], res.vect_u32[0], 1); + res.vect_u32[0] = vsetq_lane_u32(ptr_r[2], res.vect_u32[0], 2); + res.vect_u32[0] = vsetq_lane_u32(ptr_r[3], res.vect_u32[0], 3); + res.vect_u32[1] = vsetq_lane_u32(ptr_r[4], res.vect_u32[1], 0); + res.vect_u32[1] = vsetq_lane_u32(ptr_r[5], res.vect_u32[1], 1); + res.vect_u32[1] = vsetq_lane_u32(ptr_r[6], res.vect_u32[1], 2); + res.vect_u32[1] = vsetq_lane_u32(ptr_r[7], res.vect_u32[1], 3); + return res; +} + +FORCE_INLINE __m256i _mm256_div_epu64(__m256i a, __m256i b) +{ + __m256i res; + uint64_t ptr_a[4], ptr_b[4], ptr_r[4]; + _mm256_convert_to_uint64(ptr_a, a); + _mm256_convert_to_uint64(ptr_b, b); + ptr_r[0] = ptr_a[0] / ptr_b[0]; + ptr_r[1] = ptr_a[1] / ptr_b[1]; + ptr_r[2] = ptr_a[2] / ptr_b[2]; + ptr_r[3] = ptr_a[3] / ptr_b[3]; + res.vect_u64[0] = vsetq_lane_u64(ptr_r[0], res.vect_u64[0], 0); + res.vect_u64[0] = vsetq_lane_u64(ptr_r[1], res.vect_u64[0], 1); + res.vect_u64[1] = vsetq_lane_u64(ptr_r[2], res.vect_u64[1], 0); + res.vect_u64[1] = vsetq_lane_u64(ptr_r[3], res.vect_u64[1], 1); + return res; +} + +FORCE_INLINE __m256 
_mm256_div_ps(__m256 a, __m256 b) +{ + __m256 res_m256; + res_m256.vect_f32[0] = vdivq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m256.vect_f32[1] = vdivq_f32(a.vect_f32[1], b.vect_f32[1]); + return res_m256; +} + +FORCE_INLINE __m256d _mm256_div_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vdivq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vdivq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256i _mm256_add_epi8(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s8[0] = vaddq_s8(a.vect_s8[0], b.vect_s8[0]); + res_m256i.vect_s8[1] = vaddq_s8(a.vect_s8[1], b.vect_s8[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_add_epi16(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s16[0] = vaddq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m256i.vect_s16[1] = vaddq_s16(a.vect_s16[1], b.vect_s16[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_add_epi32(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s32[0] = vaddq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m256i.vect_s32[1] = vaddq_s32(a.vect_s32[1], b.vect_s32[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_add_epi64(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s64[0] = vaddq_s64(a.vect_s64[0], b.vect_s64[0]); + res_m256i.vect_s64[1] = vaddq_s64(a.vect_s64[1], b.vect_s64[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_adds_epi8(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s8[0] = vqaddq_s8(a.vect_s8[0], b.vect_s8[0]); + res_m256i.vect_s8[1] = vqaddq_s8(a.vect_s8[1], b.vect_s8[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_adds_epi16(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s16[0] = vqaddq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m256i.vect_s16[1] = vqaddq_s16(a.vect_s16[1], b.vect_s16[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_adds_epu8(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_u8[0] = vqaddq_u8(a.vect_u8[0], b.vect_u8[0]); + res_m256i.vect_u8[1] = vqaddq_u8(a.vect_u8[1], b.vect_u8[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_adds_epu16(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_u16[0] = vqaddq_u16(a.vect_u16[0], b.vect_u16[0]); + res_m256i.vect_u16[1] = vqaddq_u16(a.vect_u16[1], b.vect_u16[1]); + return res_m256i; +} + +FORCE_INLINE __m256 _mm256_add_ps(__m256 a, __m256 b) +{ + __m256 res_m256; + res_m256.vect_f32[0] = vaddq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m256.vect_f32[1] = vaddq_f32(a.vect_f32[1], b.vect_f32[1]); + return res_m256; +} + +FORCE_INLINE __m256d _mm256_add_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vaddq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vaddq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256 _mm256_addsub_ps (__m256 a, __m256 b) +{ + __m256 c; + __asm__ __volatile__ ( + "fsub %2.4s, %0.4s, %4.4s \n\t" + "fsub %3.4s, %1.4s, %5.4s \n\t" + "fadd %0.4s, %0.4s, %4.4s \n\t" + "fadd %1.4s, %1.4s, %5.4s \n\t" + "mov %2.s[1], %0.s[1] \n\t" + "mov %3.s[1], %1.s[1] \n\t" + "mov %2.s[3], %0.s[3] \n\t" + "mov %3.s[3], %1.s[3] \n\t" + :"+w"(a.vect_f32[0]), "+w"(a.vect_f32[1]), "+w"(c.vect_f32[0]), "+w"(c.vect_f32[1]) + :"w"(b.vect_f32[0]), "w"(b.vect_f32[1]) + ); + return c; +} +FORCE_INLINE __m256d _mm256_addsub_pd (__m256d a, __m256d b) +{ + __m256d c; + __asm__ __volatile__ ( + "fsub %2.2d, %0.2d, %4.2d \n\t" + "fsub %3.2d, %1.2d, %5.2d \n\t" + "fadd %0.2d, 
%0.2d, %4.2d \n\t" + "fadd %1.2d, %1.2d, %5.2d \n\t" + "mov %2.d[1], %0.d[1] \n\t" + "mov %3.d[1], %1.d[1] \n\t" + :"+w"(a.vect_f64[0]), "+w"(a.vect_f64[1]), "+w"(c.vect_f64[0]), "+w"(c.vect_f64[1]) + :"w"(b.vect_f64[0]), "w"(b.vect_f64[1]) + ); + return c; +} + +FORCE_INLINE __m256i _mm256_sub_epi16 (__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s16[0] = vsubq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m256i.vect_s16[1] = vsubq_s16(a.vect_s16[1], b.vect_s16[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_sub_epi32(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s32[0] = vsubq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m256i.vect_s32[1] = vsubq_s32(a.vect_s32[1], b.vect_s32[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_sub_epi64(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s64[0] = vsubq_s64(a.vect_s64[0], b.vect_s64[0]); + res_m256i.vect_s64[1] = vsubq_s64(a.vect_s64[1], b.vect_s64[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_sub_epi8(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s8[0] = vsubq_s8(a.vect_s8[0], b.vect_s8[0]); + res_m256i.vect_s8[1] = vsubq_s8(a.vect_s8[1], b.vect_s8[1]); + return res_m256i; +} + +FORCE_INLINE __m256d _mm256_sub_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vsubq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vsubq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256 _mm256_sub_ps(__m256 a, __m256 b) +{ + __m256 res_m256; + res_m256.vect_f32[0] = vsubq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m256.vect_f32[1] = vsubq_f32(a.vect_f32[1], b.vect_f32[1]); + return res_m256; +} + +FORCE_INLINE __m256i _mm256_subs_epi16(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s16[0] = vqsubq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m256i.vect_s16[1] = vqsubq_s16(a.vect_s16[1], b.vect_s16[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_subs_epi8(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s8[0] = vqsubq_s8(a.vect_s8[0], b.vect_s8[0]); + res_m256i.vect_s8[1] = vqsubq_s8(a.vect_s8[1], b.vect_s8[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_subs_epu16(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_u16[0] = vqsubq_u16(a.vect_u16[0], b.vect_u16[0]); + res_m256i.vect_u16[1] = vqsubq_u16(a.vect_u16[1], b.vect_u16[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_subs_epu8(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_u8[0] = vqsubq_u8(a.vect_u8[0], b.vect_u8[0]); + res_m256i.vect_u8[1] = vqsubq_u8(a.vect_u8[1], b.vect_u8[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_mul_epi32(__m256i a, __m256i b) +{ + __asm__ __volatile__ ( + "ins %[a0].s[1], %[a0].s[2] \n\t" + "ins %[a1].s[1], %[a1].s[2] \n\t" + "ins %[b0].s[1], %[b0].s[2] \n\t" + "ins %[b1].s[1], %[b1].s[2] \n\t" + "smull %[a0].2d, %[a0].2s, %[b0].2s \n\t" + "smull %[a1].2d, %[a1].2s, %[b1].2s \n\t" + :[a0]"+w"(a.vect_s32[0]), [a1]"+w"(a.vect_s32[1]), [b0]"+w"(b.vect_s32[0]), [b1]"+w"(b.vect_s32[1]) + : + : + ); + return a; +} + +FORCE_INLINE __m256i _mm256_mul_epu32(__m256i a, __m256i b) +{ + __asm__ __volatile__ ( + "ins %[a0].s[1], %[a0].s[2] \n\t" + "ins %[a1].s[1], %[a1].s[2] \n\t" + "ins %[b0].s[1], %[b0].s[2] \n\t" + "ins %[b1].s[1], %[b1].s[2] \n\t" + "umull %[a0].2d, %[a0].2s, %[b0].2s \n\t" + "umull %[a1].2d, %[a1].2s, %[b1].2s \n\t" + :[a0]"+w"(a.vect_u32[0]), [a1]"+w"(a.vect_u32[1]), [b0]"+w"(b.vect_u32[0]), [b1]"+w"(b.vect_u32[1]) + : 
+ : + ); + return a; +} + +FORCE_INLINE __m256d _mm256_mul_pd(__m256d a, __m256d b) +{ + __m256d res_m256d; + res_m256d.vect_f64[0] = vmulq_f64(a.vect_f64[0], b.vect_f64[0]); + res_m256d.vect_f64[1] = vmulq_f64(a.vect_f64[1], b.vect_f64[1]); + return res_m256d; +} + +FORCE_INLINE __m256 _mm256_mul_ps(__m256 a, __m256 b) +{ + __m256 res_m256; + res_m256.vect_f32[0] = vmulq_f32(a.vect_f32[0], b.vect_f32[0]); + res_m256.vect_f32[1] = vmulq_f32(a.vect_f32[1], b.vect_f32[1]); + return res_m256; +} + +FORCE_INLINE __m256i _mm256_mulhi_epi16(__m256i a, __m256i b) +{ + __asm__ __volatile__ ( + "mov v4.d[0], %[a0].d[1] \n\t" + "mov v5.d[0], %[a1].d[1] \n\t" + "smull %[a0].4s, %[a0].4h, %[b0].4h \n\t" + "mov v6.d[0], %[b0].d[1] \n\t" + "mov v7.d[0], %[b1].d[1] \n\t" + "smull %[b0].4s, v4.4h, v6.4h \n\t" + "uzp2 %[a0].8h, %[a0].8h, %[b0].8h \n\t" + "smull %[a1].4s, %[a1].4h, %[b1].4h \n\t" + "smull %[b1].4s, v5.4h, v7.4h \n\t" + "uzp2 %[a1].8h, %[a1].8h, %[b1].8h \n\t" + :[a0]"+w"(a.vect_s16[0]), [a1]"+w"(a.vect_s16[1]), [b0]"+w"(b.vect_s16[0]), [b1]"+w"(b.vect_s16[1]) + : + :"v4", "v5", "v6", "v7" + ); + return a; +} + +FORCE_INLINE __m256i _mm256_mulhi_epu16(__m256i a, __m256i b) +{ + __asm__ __volatile__ ( + "mov v4.d[0], %[a0].d[1] \n\t" + "mov v5.d[0], %[a1].d[1] \n\t" + "umull %[a0].4s, %[a0].4h, %[b0].4h \n\t" + "mov v6.d[0], %[b0].d[1] \n\t" + "mov v7.d[0], %[b1].d[1] \n\t" + "umull %[b0].4s, v4.4h, v6.4h \n\t" + "uzp2 %[a0].8h, %[a0].8h, %[b0].8h \n\t" + "umull %[a1].4s, %[a1].4h, %[b1].4h \n\t" + "umull %[b1].4s, v5.4h, v7.4h \n\t" + "uzp2 %[a1].8h, %[a1].8h, %[b1].8h \n\t" + :[a0]"+w"(a.vect_u16[0]), [a1]"+w"(a.vect_u16[1]), [b0]"+w"(b.vect_u16[0]), [b1]"+w"(b.vect_u16[1]) + : + :"v4", "v5", "v6", "v7" + ); + return a; +} + +FORCE_INLINE __m256i _mm256_mulhi_epi32(__m256i a, __m256i b) +{ + __asm__ __volatile__ ( + "mov v4.d[0], %[a0].d[1] \n\t" + "mov v5.d[0], %[a1].d[1] \n\t" + "smull %[a0].2d, %[a0].2s, %[b0].2s \n\t" + "mov v6.d[0], %[b0].d[1] \n\t" + "mov v7.d[0], %[b1].d[1] \n\t" + "smull %[b0].2d, v4.2s, v6.2s \n\t" + "uzp2 %[a0].4s, %[a0].4s, %[b0].4s \n\t" + "smull %[a1].2d, %[a1].2s, %[b1].2s \n\t" + "smull %[b1].2d, v5.2s, v7.2s \n\t" + "uzp2 %[a1].4s, %[a1].4s, %[b1].4s \n\t" + :[a0]"+w"(a.vect_s32[0]), [a1]"+w"(a.vect_s32[1]), [b0]"+w"(b.vect_s32[0]), [b1]"+w"(b.vect_s32[1]) + : + :"v4", "v5", "v6", "v7" + ); + return a; +} + +FORCE_INLINE __m256i _mm256_mulhi_epu32(__m256i a, __m256i b) +{ + __asm__ __volatile__ ( + "mov v4.d[0], %[a0].d[1] \n\t" + "mov v5.d[0], %[a1].d[1] \n\t" + "umull %[a0].2d, %[a0].2s, %[b0].2s \n\t" + "mov v6.d[0], %[b0].d[1] \n\t" + "mov v7.d[0], %[b1].d[1] \n\t" + "umull %[b0].2d, v4.2s, v6.2s \n\t" + "uzp2 %[a0].4s, %[a0].4s, %[b0].4s \n\t" + "umull %[a1].2d, %[a1].2s, %[b1].2s \n\t" + "umull %[b1].2d, v5.2s, v7.2s \n\t" + "uzp2 %[a1].4s, %[a1].4s, %[b1].4s \n\t" + :[a0]"+w"(a.vect_s32[0]), [a1]"+w"(a.vect_s32[1]), [b0]"+w"(b.vect_s32[0]), [b1]"+w"(b.vect_s32[1]) + : + :"v4", "v5", "v6", "v7" + ); + return a; +} + +FORCE_INLINE __m256i _mm256_mullo_epi16(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s16[0] = vmulq_s16(a.vect_s16[0], b.vect_s16[0]); + res_m256i.vect_s16[1] = vmulq_s16(a.vect_s16[1], b.vect_s16[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_mullo_epi32(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s32[0] = vmulq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m256i.vect_s32[1] = vmulq_s32(a.vect_s32[1], b.vect_s32[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_mullo_epi64(__m256i a, 
__m256i b) +{ + __m256i res; + int64_t ptr_a[4], ptr_b[4], ptr_r[4]; + _mm256_convert_to_int64(ptr_a, a); + _mm256_convert_to_int64(ptr_b, b); + ptr_r[0] = ptr_a[0] * ptr_b[0]; + ptr_r[1] = ptr_a[1] * ptr_b[1]; + ptr_r[2] = ptr_a[2] * ptr_b[2]; + ptr_r[3] = ptr_a[3] * ptr_b[3]; + res.vect_s64[0] = vsetq_lane_s64(ptr_r[0], res.vect_s64[0], 0); + res.vect_s64[0] = vsetq_lane_s64(ptr_r[1], res.vect_s64[0], 1); + res.vect_s64[1] = vsetq_lane_s64(ptr_r[2], res.vect_s64[1], 0); + res.vect_s64[1] = vsetq_lane_s64(ptr_r[3], res.vect_s64[1], 1); + return res; +} + +FORCE_INLINE __m256i _mm256_mulhrs_epi16(__m256i a, __m256i b) +{ + __m256i res; + int32x4_t r_0 = vmull_s16(vget_low_s16(a.vect_s16[0]), vget_low_s16(b.vect_s16[0])); + int32x4_t r_1 = vmull_s16(vget_low_s16(a.vect_s16[1]), vget_low_s16(b.vect_s16[1])); + int32x4_t r_2 = vmull_s16(vget_high_s16(a.vect_s16[0]), vget_high_s16(b.vect_s16[0])); + int32x4_t r_3 = vmull_s16(vget_high_s16(a.vect_s16[1]), vget_high_s16(b.vect_s16[1])); + + int32x4_t inc = vdupq_n_s32(0x00004000); + r_0 = vshrq_n_s32(vaddq_s32(r_0, inc), 15); + r_1 = vshrq_n_s32(vaddq_s32(r_1, inc), 15); + r_2 = vshrq_n_s32(vaddq_s32(r_2, inc), 15); + r_3 = vshrq_n_s32(vaddq_s32(r_3, inc), 15); + res.vect_s16[0] = vuzp1q_s16(vreinterpretq_s16_s32(r_0), vreinterpretq_s16_s32(r_2)); + res.vect_s16[1] = vuzp1q_s16(vreinterpretq_s16_s32(r_1), vreinterpretq_s16_s32(r_3)); + return res; +} + +FORCE_INLINE void _mm256_zeroupper(void) +{ + return; /* zeroes the upper 128 bits of the 256-bit registers on x86; ARM NEON registers are at most 128 bits wide, so there is no upper half to clear and this interface is a no-op */ +} + +FORCE_INLINE __m256i _mm256_sll_epi32(__m256i a, __m128i count) +{ + long long c = count.vect_s64[0]; + __m256i result_m256i; + if (likely(c >= 0 && c < 32)) { + int32x4_t vect_imm = vdupq_n_s32(c); /* the count is a runtime value, so use vshlq_s32 rather than the immediate-only vshlq_n_s32 */ + result_m256i.vect_s32[0] = vshlq_s32(a.vect_s32[0], vect_imm); + result_m256i.vect_s32[1] = vshlq_s32(a.vect_s32[1], vect_imm); + } else { + result_m256i.vect_s32[0] = vdupq_n_s32(0); + result_m256i.vect_s32[1] = vdupq_n_s32(0); + } + return result_m256i; +} +FORCE_INLINE __m256i _mm256_sll_epi64(__m256i a, __m128i count) +{ + long long c = count.vect_s64[0]; + __m256i result_m256i; + if (likely(c >= 0 && c < 64)) { + int64x2_t vect_imm = vdupq_n_s64(c); /* same: a register-supplied count needs vshlq_s64, not vshlq_n_s64 */ + result_m256i.vect_s64[0] = vshlq_s64(a.vect_s64[0], vect_imm); + result_m256i.vect_s64[1] = vshlq_s64(a.vect_s64[1], vect_imm); + } else { + result_m256i.vect_s64[0] = vdupq_n_s64(0); + result_m256i.vect_s64[1] = vdupq_n_s64(0); + } + return result_m256i; +} + +FORCE_INLINE __m256i _mm256_slli_epi32(__m256i a, int imm8) +{ + __m256i result_m256i; + if (likely(imm8 >= 0 && imm8 < 32)) { + result_m256i.vect_s32[0] = vshlq_n_s32(a.vect_s32[0], imm8); + result_m256i.vect_s32[1] = vshlq_n_s32(a.vect_s32[1], imm8); + } else { + result_m256i.vect_s32[0] = vdupq_n_s32(0); + result_m256i.vect_s32[1] = vdupq_n_s32(0); + } + return result_m256i; +} + +FORCE_INLINE __m256i _mm256_slli_epi64(__m256i a, int imm8) +{ + __m256i result_m256i; + if (likely(imm8 >= 0 && imm8 < 64)) { + result_m256i.vect_s64[0] = vshlq_n_s64(a.vect_s64[0], imm8); + result_m256i.vect_s64[1] = vshlq_n_s64(a.vect_s64[1], imm8); + } else { + result_m256i.vect_s64[0] = vdupq_n_s64(0); + result_m256i.vect_s64[1] = vdupq_n_s64(0); + } + return result_m256i; +} + +FORCE_INLINE __m256i _mm256_srli_epi64(__m256i a, int imm8) +{ + __m256i result_m256i; + + if (likely(imm8 >= 0 && imm8 < 64)) { + int64x2_t vect_imm = vdupq_n_s64(-imm8); + result_m256i.vect_u64[0] = vshlq_u64(a.vect_u64[0], vect_imm); + result_m256i.vect_u64[1] = vshlq_u64(a.vect_u64[1], vect_imm); + } else { + result_m256i.vect_u64[0] = vdupq_n_u64(0); + result_m256i.vect_u64[1] = vdupq_n_u64(0); + } + return result_m256i; +} +
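+/* + * A minimal usage sketch of the shift shims above (illustrative only, not + * part of the original API surface): they follow the x86 convention that a + * count outside the element width zeroes every lane, rather than wrapping + * the way a plain C shift would. + * + * __m256i v = _mm256_set1_epi64x(1); + * __m256i r = _mm256_slli_epi64(v, 3); // each 64-bit lane holds 8 + * __m256i z = _mm256_srli_epi64(v, 64); // count out of range: each lane is 0 + */ +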
+FORCE_INLINE __m256i _mm256_slli_si256(__m256i a, const int imm8) +{ + assert(imm8 >=0 && imm8 <256); + __m256i result_m256i; + if (likely(imm8 > 0 && imm8 <= 15)) { + result_m256i.vect_s8[0] = vextq_s8(vdupq_n_s8(0), a.vect_s8[0], 16 - imm8); + result_m256i.vect_s8[1] = vextq_s8(vdupq_n_s8(0), a.vect_s8[1], 16 - imm8); + } else if (imm8 == 0) { + result_m256i = a; + } else { + result_m256i.vect_s8[0] = vdupq_n_s8(0); + result_m256i.vect_s8[1] = vdupq_n_s8(0); + } + return result_m256i; +} + +FORCE_INLINE __m256i _mm256_srli_si256(__m256i a, const int imm8) +{ + assert(imm8 >=0 && imm8 <256); + __m256i result_m256i; + if (likely(imm8 > 0 && imm8 <= 15)) { + result_m256i.vect_s8[0] = vextq_s8(a.vect_s8[0], vdupq_n_s8(0), imm8); + result_m256i.vect_s8[1] = vextq_s8(a.vect_s8[1], vdupq_n_s8(0), imm8); + } else if (imm8 == 0) { + result_m256i = a; + } else { + result_m256i.vect_s8[0] = vdupq_n_s8(0); + result_m256i.vect_s8[1] = vdupq_n_s8(0); + } + return result_m256i; +} + +FORCE_INLINE __m256i _mm256_unpackhi_epi8(__m256i a, __m256i b) +{ + __m256i result_m256i; + result_m256i.vect_s8[0] = vzip2q_s8(a.vect_s8[0], b.vect_s8[0]); + result_m256i.vect_s8[1] = vzip2q_s8(a.vect_s8[1], b.vect_s8[1]); + return result_m256i; +} + +FORCE_INLINE __m256i _mm256_unpacklo_epi8(__m256i a, __m256i b) +{ + __m256i result_m256i; + result_m256i.vect_s8[0] = vzip1q_s8(a.vect_s8[0], b.vect_s8[0]); + result_m256i.vect_s8[1] = vzip1q_s8(a.vect_s8[1], b.vect_s8[1]); + return result_m256i; +} + +FORCE_INLINE __m256i _mm256_and_si256(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s32[0] = vandq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m256i.vect_s32[1] = vandq_s32(a.vect_s32[1], b.vect_s32[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_andnot_si256(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s32[0] = vbicq_s32(b.vect_s32[0], a.vect_s32[0]); + res_m256i.vect_s32[1] = vbicq_s32(b.vect_s32[1], a.vect_s32[1]); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_xor_si256(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s32[0] = veorq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m256i.vect_s32[1] = veorq_s32(a.vect_s32[1], b.vect_s32[1]); + return res_m256i; +} + +FORCE_INLINE __m256 _mm256_or_ps (__m256 a, __m256 b) +{ + __asm__ __volatile( + "orr %0.16b, %0.16b, %2.16b \n\t" + "orr %1.16b, %1.16b, %3.16b \n\t" + :"+w"(a.vect_f32[0]), "+w"(a.vect_f32[1]) + :"w"(b.vect_f32[0]), "w"(b.vect_f32[1]) + ); + return a; +} + +FORCE_INLINE __m256d _mm256_or_pd (__m256d a, __m256d b) +{ + __asm__ __volatile( + "orr %0.16b, %0.16b, %2.16b \n\t" + "orr %1.16b, %1.16b, %3.16b \n\t" + :"+w"(a.vect_f64[0]), "+w"(a.vect_f64[1]) + :"w"(b.vect_f64[0]), "w"(b.vect_f64[1]) + ); + return a; +} + +FORCE_INLINE int _mm256_movemask_epi8 (__m256i a) +{ + int res; + __asm__ __volatile__ ( + "ushr %[a0].16b, %[a0].16b, #7 \n\t" + "ushr %[a1].16b, %[a1].16b, #7 \n\t" + "usra %[a0].8h, %[a0].8h, #7 \n\t" + "usra %[a1].8h, %[a1].8h, #7 \n\t" + "usra %[a0].4s, %[a0].4s, #14 \n\t" + "usra %[a1].4s, %[a1].4s, #14 \n\t" + "usra %[a0].2d, %[a0].2d, #28 \n\t" + "usra %[a1].2d, %[a1].2d, #28 \n\t" + "ins %[a0].b[1], %[a0].b[8] \n\t" + "ins %[a0].b[2], %[a1].b[0] \n\t" + "ins %[a0].b[3], %[a1].b[8] \n\t" + "umov %w[r], %[a0].s[0]" + :[r]"=r"(res), [a0]"+w"(a.vect_u8[0]), [a1]"+w"(a.vect_u8[1]) + : + : + ); + return res; +} + +FORCE_INLINE int _mm256_movemask_ps(__m256 a) +{ + __m256i res_m256i; + res_m256i.vect_u32[0] = vshrq_n_u32(vreinterpretq_u32_f32(a.vect_f32[0]), 31); + res_m256i.vect_u32[1] = 
vshrq_n_u32(vreinterpretq_u32_f32(a.vect_f32[1]), 31); + res_m256i.vect_u64[0] = vsraq_n_u64(res_m256i.vect_u64[0], res_m256i.vect_u64[0], 31); + res_m256i.vect_u64[1] = vsraq_n_u64(res_m256i.vect_u64[1], res_m256i.vect_u64[1], 31); + return (int)(vgetq_lane_u8(res_m256i.vect_u8[0], 0) | (vgetq_lane_u8(res_m256i.vect_u8[0], 8) << 2) | + (vgetq_lane_u8(res_m256i.vect_u8[1], 0) << 4) | (vgetq_lane_u8(res_m256i.vect_u8[1], 8) << 6)); +} + +FORCE_INLINE int _mm256_testz_si256(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s64[0] = vandq_s64(a.vect_s64[0], b.vect_s64[0]); + res_m256i.vect_s64[1] = vandq_s64(a.vect_s64[1], b.vect_s64[1]); + int64x2_t tmp = vorrq_s64(res_m256i.vect_s64[0], res_m256i.vect_s64[1]); + return !(vgetq_lane_s64(tmp, 0) | vgetq_lane_s64(tmp, 1)); +} + +FORCE_INLINE __m256i _mm256_or_si256(__m256i a, __m256i b) +{ + __m256i res_m256i; + res_m256i.vect_s32[0] = vorrq_s32(a.vect_s32[0], b.vect_s32[0]); + res_m256i.vect_s32[1] = vorrq_s32(a.vect_s32[1], b.vect_s32[1]); + return res_m256i; +} + +FORCE_INLINE __m128i _mm256_extracti128_si256(__m256i a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + __m128i res_m128i; + res_m128i.vect_s64 = a.vect_s64[imm8]; + return res_m128i; +} + +FORCE_INLINE __int32 _mm256_extract_epi32 (__m256i a, const int index) +{ + assert(index >= 0 && index <= 7); + return a.vect_s32[!!(index & 0x04)][index & 0x03]; +} + +FORCE_INLINE __int64 _mm256_extract_epi64 (__m256i a, const int index) +{ + assert(index >= 0 && index <= 3); + return a.vect_s64[!!(index & 0x02)][index & 0x01]; +} + +FORCE_INLINE __m128 _mm256_extractf128_ps (__m256 a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + return a.vect_f32[imm8]; +} + +FORCE_INLINE __m128d _mm256_extractf128_pd (__m256d a, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 1); + return a.vect_f64[imm8]; +} + +FORCE_INLINE __m256i _mm256_permute4x64_epi64(__m256i a, const int imm8) +{ + __m256i res; + int64_t ptr_a[4]; + vst1q_s64(ptr_a, a.vect_s64[0]); + vst1q_s64(ptr_a + 2, a.vect_s64[1]); + const int id0 = imm8 & 0x03; + const int id1 = (imm8 >> 2) & 0x03; + const int id2 = (imm8 >> 4) & 0x03; + const int id3 = (imm8 >> 6) & 0x03; + res.vect_s64[0] = vsetq_lane_s64(ptr_a[id0], res.vect_s64[0], 0); + res.vect_s64[0] = vsetq_lane_s64(ptr_a[id1], res.vect_s64[0], 1); + res.vect_s64[1] = vsetq_lane_s64(ptr_a[id2], res.vect_s64[1], 0); + res.vect_s64[1] = vsetq_lane_s64(ptr_a[id3], res.vect_s64[1], 1); + return res; +} + +FORCE_INLINE __m256i _mm256_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0) +{ + __m256i res_m256i; + SET32x4(res_m256i.vect_s32[0], e0, e1, e2, e3); + SET32x4(res_m256i.vect_s32[1], e4, e5, e6, e7); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_set_epi64x(int64_t e3, int64_t e2, int64_t e1, int64_t e0) +{ + __m256i res_m256i; + SET64x2(res_m256i.vect_s64[0], e0, e1); + SET64x2(res_m256i.vect_s64[1], e2, e3); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_set_m128i(__m128i hi, __m128i lo) +{ + __m256i res_m256i; + res_m256i.vect_s32[0] = lo.vect_s32; + res_m256i.vect_s32[1] = hi.vect_s32; + return res_m256i; +} + +FORCE_INLINE __m256 _mm256_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0) +{ + __m256 res_m256; + SET32x4(res_m256.vect_f32[0], e0, e1, e2, e3); + SET32x4(res_m256.vect_f32[1], e4, e5, e6, e7); + return res_m256; +} + +FORCE_INLINE __m256d _mm256_set_pd(double e3, double e2, double e1, double e0) +{ + __m256d res_m256d; + SET64x2(res_m256d.vect_f64[0], e0, e1); + 
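/* Note: the AVX "set" intrinsics list their arguments from the highest
+       element down, so e0 fills the lowest lane of the low 128-bit half. */
+    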
SET64x2(res_m256d.vect_f64[1], e2, e3); + return res_m256d; +} + +FORCE_INLINE __m256i _mm256_setzero_si256(void) +{ + __m256i ret; + ret.vect_s32[0] = ret.vect_s32[1] = vdupq_n_s32(0); + return ret; +} + +FORCE_INLINE __m256 _mm256_setzero_ps(void) +{ + __m256 ret; + ret.vect_f32[0] = ret.vect_f32[1] = vdupq_n_f32(0.0f); + return ret; +} + +FORCE_INLINE __m256d _mm256_setzero_pd(void) +{ + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(0.0); + return ret; +} + +FORCE_INLINE __m256i _mm256_set1_epi8(int8_t a) +{ + __m256i ret; + ret.vect_s8[0] = ret.vect_s8[1] = vdupq_n_s8(a); + return ret; +} + +FORCE_INLINE __m256i _mm256_set1_epi32(int32_t a) +{ + __m256i ret; + ret.vect_s32[0] = ret.vect_s32[1] = vdupq_n_s32(a); + return ret; +} + +FORCE_INLINE __m256i _mm256_set1_epi64x(int64_t a) +{ + __m256i ret; + ret.vect_s64[0] = ret.vect_s64[1] = vdupq_n_s64(a); + return ret; +} + +FORCE_INLINE __m256d _mm256_set1_pd(double a) +{ + __m256d ret; + ret.vect_f64[0] = ret.vect_f64[1] = vdupq_n_f64(a); + return ret; +} + +FORCE_INLINE __m256 _mm256_set1_ps(float a) +{ + __m256 ret; + ret.vect_f32[0] = ret.vect_f32[1] = vdupq_n_f32(a); + return ret; +} + +FORCE_INLINE void _mm256_store_si256(__m256i* mem_addr, __m256i a) +{ + vst1q_s32((int32_t*)mem_addr, a.vect_s32[0]); + vst1q_s32((int32_t*)mem_addr + 4, a.vect_s32[1]); +} + +FORCE_INLINE void _mm256_storeu_si256(__m256i* mem_addr, __m256i a) +{ + vst1q_s8((int8_t*)mem_addr, a.vect_s8[0]); + vst1q_s8((int8_t*)mem_addr + 16, a.vect_s8[1]); +} + +FORCE_INLINE __m256i _mm256_load_si256(__m256i const* mem_addr) +{ + __m256i ret; + ret.vect_s32[0] = vld1q_s32((int32_t const*)mem_addr); + ret.vect_s32[1] = vld1q_s32(((int32_t const*)mem_addr) + 4); + return ret; +} + +FORCE_INLINE __m256i _mm256_loadu_si256(__m256i const* mem_addr) +{ + __m256i ret; + ret.vect_s32[0] = vld1q_s32((int32_t const*)mem_addr); + ret.vect_s32[1] = vld1q_s32(((int32_t const*)mem_addr) + 4); + return ret; +} + +FORCE_INLINE __m256i _mm256_maskload_epi32(int const* mem_addr, __m256i mask) +{ + __m256i ret; + + int32x4_t vecZero = vdupq_n_s32(0); + __m128i flag; + + flag.vect_u32 = vcltq_s32(mask.vect_s32[0], vecZero); + ret.vect_s32[0] = vandq_s32(flag.vect_s32, vld1q_s32(mem_addr)); + + flag.vect_u32 = vcltq_s32(mask.vect_s32[1], vecZero); + ret.vect_s32[1] = vandq_s32(flag.vect_s32, vld1q_s32(mem_addr + 4)); + + return ret; +} + +FORCE_INLINE __m256i _mm256_broadcastq_epi64(__m128i a) +{ + __m256i res; + __asm__ __volatile__ ( + "dup %[r0].2d, %[a].d[0] \n\t" + "dup %[r1].2d, %[a].d[0] \n\t" + :[r0]"=w"(res.vect_s64[0]), [r1]"=w"(res.vect_s64[1]) + :[a]"w"(a.vect_s64) + : + ); + return res; +} + +FORCE_INLINE __m256i _mm256_broadcastsi128_si256(__m128i a) +{ + __m256i ret; + + ret.vect_s32[0] = a.vect_s32; + ret.vect_s32[1] = a.vect_s32; + + return ret; +} + +FORCE_INLINE __m256d _mm256_castpd128_pd256(__m128d a) +{ + __m256d res; + res.vect_f64[0] = a; + return res; +} + +FORCE_INLINE __m128d _mm256_castpd256_pd128(__m256d a) +{ + return a.vect_f64[0]; +} + +FORCE_INLINE __m256 _mm256_castps128_ps256(__m128 a) +{ + __m256 res; + res.vect_f32[0] = a; + return res; +} + +FORCE_INLINE __m128 _mm256_castps256_ps128(__m256 a) +{ + return a.vect_f32[0]; +} + +FORCE_INLINE __m256i _mm256_castsi128_si256(__m128i a) +{ + __m256i res; + res.vect_s32[0] = a.vect_s32; + return res; +} + +FORCE_INLINE __m256 _mm256_castsi256_ps(__m256i a) +{ + __m256 b; + + b.vect_f32[0] = vreinterpretq_f32_s32(a.vect_s32[0]); + b.vect_f32[1] = vreinterpretq_f32_s32(a.vect_s32[1]); + + 
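/* The cast only reinterprets bits; vreinterpretq emits no instructions,
+       matching the zero-cost semantics of the x86 cast intrinsics. */
+    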
return b; +} + +FORCE_INLINE __m128i _mm256_castsi256_si128(__m256i a) +{ + __m128i ret; + ret.vect_s32 = a.vect_s32[0]; + return ret; +} + +FORCE_INLINE __m256d _mm256_cvtepi32_pd(__m128i a) +{ + __m256d res; + __asm__ __volatile__ ( + "scvtf v0.4s, %[a].4s \n\t" + "fcvtl %[r0].2d, v0.2s \n\t" + "mov v1.d[0], v0.d[1] \n\t" + "fcvtl %[r1].2d, v1.2s \n\t" + :[r0]"=w"(res.vect_f64[0]), [r1]"=w"(res.vect_f64[1]) + :[a]"w"(a.vect_s32) + :"v0", "v1" + ); + return res; +} + +FORCE_INLINE __m256 _mm256_cvtepi32_ps(__m256i a) +{ + __m256 ret; + ret.vect_f32[0] = vcvtq_f32_s32(a.vect_s32[0]); + ret.vect_f32[1] = vcvtq_f32_s32(a.vect_s32[1]); + return ret; +} + +FORCE_INLINE __m256i _mm256_shuffle_epi8(__m256i a, __m256i b) +{ + __m256i res_m256i; + uint8x16_t mask_and = vdupq_n_u8(0x8f); + res_m256i.vect_u8[0] = vqtbl1q_u8(a.vect_u8[0], vandq_u8(b.vect_u8[0], mask_and)); + res_m256i.vect_u8[1] = vqtbl1q_u8(a.vect_u8[1], vandq_u8(b.vect_u8[1], mask_and)); + return res_m256i; +} + +FORCE_INLINE __m256i _mm256_multishift_epi64_epi8(__m256i a, __m256i b) +{ + __m256i res_m256i, tmp0, tmp1, tb0, tb1, sft0, sft1; + uint8x16_t low3bit = vdupq_n_u8(0x07); + uint8x16_t inc = vdupq_n_u8(0x01); + + tmp0.vect_u8[0] = vshrq_n_u8(a.vect_u8[0], 3); + tmp0.vect_u8[1] = vshrq_n_u8(a.vect_u8[1], 3); + tmp1.vect_u8[0] = vaddq_u8(tmp0.vect_u8[0], inc); + tmp1.vect_u8[1] = vaddq_u8(tmp0.vect_u8[1], inc); + tmp0.vect_u8[0] = vandq_u8(tmp0.vect_u8[0], low3bit); + tmp0.vect_u8[1] = vandq_u8(tmp0.vect_u8[1], low3bit); + tmp1.vect_u8[0] = vandq_u8(tmp1.vect_u8[0], low3bit); + tmp1.vect_u8[1] = vandq_u8(tmp1.vect_u8[1], low3bit); + inc = vcombine_u8(vdup_n_u8(0x00), vdup_n_u8(0x08)); + tmp0.vect_u8[0] = vaddq_u8(tmp0.vect_u8[0], inc); + tmp0.vect_u8[1] = vaddq_u8(tmp0.vect_u8[1], inc); + tmp1.vect_u8[0] = vaddq_u8(tmp1.vect_u8[0], inc); + tmp1.vect_u8[1] = vaddq_u8(tmp1.vect_u8[1], inc); + a.vect_u8[0] = vandq_u8(a.vect_u8[0], low3bit); + a.vect_u8[1] = vandq_u8(a.vect_u8[1], low3bit); + + tb0.vect_u8[0] = vqtbl1q_u8(b.vect_u8[0], tmp0.vect_u8[0]); + tb0.vect_u8[1] = vqtbl1q_u8(b.vect_u8[1], tmp0.vect_u8[1]); + tb1.vect_u8[0] = vqtbl1q_u8(b.vect_u8[0], tmp1.vect_u8[0]); + tb1.vect_u8[1] = vqtbl1q_u8(b.vect_u8[1], tmp1.vect_u8[1]); + tmp0.vect_u8[0] = vzip1q_u8(tb0.vect_u8[0], tb1.vect_u8[0]); + tmp0.vect_u8[1] = vzip2q_u8(tb0.vect_u8[0], tb1.vect_u8[0]); + tmp1.vect_u8[0] = vzip1q_u8(tb0.vect_u8[1], tb1.vect_u8[1]); + tmp1.vect_u8[1] = vzip2q_u8(tb0.vect_u8[1], tb1.vect_u8[1]); + + uint8x8_t a0_low64_bit = vget_low_u8(a.vect_u8[0]); + uint8x8_t a0_high64_bit = vget_high_u8(a.vect_u8[0]); + uint8x8_t a1_low64_bit = vget_low_u8(a.vect_u8[1]); + uint8x8_t a1_high64_bit = vget_high_u8(a.vect_u8[1]); + sft0.vect_u16[0] = vmovl_u8(a0_low64_bit); + sft0.vect_u16[1] = vmovl_u8(a0_high64_bit); + sft1.vect_u16[0] = vmovl_u8(a1_low64_bit); + sft1.vect_u16[1] = vmovl_u8(a1_high64_bit); + sft0.vect_s16[0] = vnegq_s16(sft0.vect_s16[0]); + sft0.vect_s16[1] = vnegq_s16(sft0.vect_s16[1]); + sft1.vect_s16[0] = vnegq_s16(sft1.vect_s16[0]); + sft1.vect_s16[1] = vnegq_s16(sft1.vect_s16[1]); + tmp0.vect_u16[0] = vshlq_u16(tmp0.vect_u16[0], sft0.vect_s16[0]); + tmp0.vect_u16[1] = vshlq_u16(tmp0.vect_u16[1], sft0.vect_s16[1]); + tmp1.vect_u16[0] = vshlq_u16(tmp1.vect_u16[0], sft1.vect_s16[0]); + tmp1.vect_u16[1] = vshlq_u16(tmp1.vect_u16[1], sft1.vect_s16[1]); + res_m256i.vect_u8[0] = vuzp1q_u8(tmp0.vect_u8[0], tmp0.vect_u8[1]); + res_m256i.vect_u8[1] = vuzp1q_u8(tmp1.vect_u8[0], tmp1.vect_u8[1]); + + return res_m256i; +} + +FORCE_INLINE __m256i 
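/* VPALIGNR emulation: per 128-bit lane, spill b, a, and a zero vector
+   consecutively to a stack buffer, then reload 16 bytes starting at the
+   byte offset (clamped to 32, where the result is all zeros). */
+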
_mm256_alignr_epi8(__m256i a, __m256i b, const int count) +{ + __m256i res_m256i; + int8x16_t tmp0[3], tmp1[3]; + int shift = count > 32 ? 32 : count; + tmp0[2] = vdupq_n_s8(0); + tmp1[2] = vdupq_n_s8(0); + tmp0[0] = b.vect_s8[0]; + tmp0[1] = a.vect_s8[0]; + tmp1[0] = b.vect_s8[1]; + tmp1[1] = a.vect_s8[1]; + res_m256i.vect_s8[0] = vld1q_s8((int8_t *)tmp0 + shift); + res_m256i.vect_s8[1] = vld1q_s8((int8_t *)tmp1 + shift); + return res_m256i; +} + +FORCE_INLINE __m256d _mm256_blendv_pd(__m256d a, __m256d b, __m256d mask) +{ + __m256d result_m256d; + uint64x2_t vect_flag[2]; + vect_flag[0] = vcgeq_f64(mask.vect_f64[0], vdupq_n_f64(0)); + vect_flag[1] = vcgeq_f64(mask.vect_f64[1], vdupq_n_f64(0)); + result_m256d.vect_f64[0] = vbslq_f64(vect_flag[0], a.vect_f64[0], b.vect_f64[0]); + result_m256d.vect_f64[1] = vbslq_f64(vect_flag[1], a.vect_f64[1], b.vect_f64[1]); + return result_m256d; +} + +FORCE_INLINE __m256 _mm256_blendv_ps(__m256 a, __m256 b, __m256 mask) +{ + __m256 result_m256; + uint32x4_t vect_flag[2]; + vect_flag[0] = vcgeq_f32(mask.vect_f32[0], vdupq_n_f32(0)); + vect_flag[1] = vcgeq_f32(mask.vect_f32[1], vdupq_n_f32(0)); + result_m256.vect_f32[0] = vbslq_f32(vect_flag[0], a.vect_f32[0], b.vect_f32[0]); + result_m256.vect_f32[1] = vbslq_f32(vect_flag[1], a.vect_f32[1], b.vect_f32[1]); + return result_m256; +} + +FORCE_INLINE __m256 _mm256_blend_ps(__m256 a, __m256 b, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 255); + __m256 result_m256; + uint32x4_t vect_mask = vld1q_u32(g_mask_epi32); + uint32x4_t vect_imm = vdupq_n_u32(imm8); + uint32x4_t flag[2]; + flag[0] = vtstq_u32(vect_imm, vect_mask); + flag[1] = vtstq_u32(vshrq_n_u32(vect_imm, 4), vect_mask); + result_m256.vect_f32[0] = vbslq_f32(flag[0], b.vect_f32[0], a.vect_f32[0]); + result_m256.vect_f32[1] = vbslq_f32(flag[1], b.vect_f32[1], a.vect_f32[1]); + return result_m256; +} + +FORCE_INLINE __m256d _mm256_blend_pd(__m256d a, __m256d b, const int imm8) +{ + assert(imm8 >= 0 && imm8 <= 15); + __m256d result_m256d; + uint64x2_t vect_mask = vld1q_u64(g_mask_epi64); + uint64x2_t vect_imm = vdupq_n_u64(imm8); + uint64x2_t flag[2]; + flag[0] = vtstq_u64(vect_imm, vect_mask); + flag[1] = vtstq_u64(vshrq_n_u64(vect_imm, 2), vect_mask); + result_m256d.vect_f64[0] = vbslq_f64(flag[0], b.vect_f64[0], a.vect_f64[0]); + result_m256d.vect_f64[1] = vbslq_f64(flag[1], b.vect_f64[1], a.vect_f64[1]); + return result_m256d; +} + +FORCE_INLINE __m256i _mm256_inserti128_si256(__m256i a, __m128i b, const int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m256i res; + uint32x4_t vmask = vceqq_s32(vdupq_n_s32(imm8), vdupq_n_s32(0)); + res.vect_s32[0] = vbslq_s32(vmask, b.vect_s32, a.vect_s32[0]); + res.vect_s32[1] = vbslq_s32(vmask, a.vect_s32[1], b.vect_s32); + return res; +} + +FORCE_INLINE __m256d _mm256_insertf128_pd(__m256d a, __m128d b, int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m256d res; + uint64x2_t vmask = vceqq_s64(vdupq_n_s64(imm8), vdupq_n_s64(0)); + res.vect_f64[0] = vbslq_f64(vmask, b, a.vect_f64[0]); + res.vect_f64[1] = vbslq_f64(vmask, a.vect_f64[1], b); + return res; +} + +FORCE_INLINE __m256 _mm256_insertf128_ps(__m256 a, __m128 b, int imm8) +{ + assert(imm8 == 0 || imm8 == 1); + __m256 res; + uint32x4_t vmask = vceqq_s32(vdupq_n_s32(imm8), vdupq_n_s32(0)); + res.vect_f32[0] = vbslq_f32(vmask, b, a.vect_f32[0]); + res.vect_f32[1] = vbslq_f32(vmask, a.vect_f32[1], b); + return res; +} + +FORCE_INLINE __m256i _mm256_insert_epi32 (__m256i a, __int32 i, const int index) +{ + assert(index >= 0 && index <= 7); + if (index > 
3) { + a.vect_s32[1] = vsetq_lane_s32(i, a.vect_s32[1], index & 3); + } else { + a.vect_s32[0] = vsetq_lane_s32(i, a.vect_s32[0], index); + } + return a; +} + +FORCE_INLINE __m256i _mm256_insert_epi64 (__m256i a, __int64 i, const int index) +{ + assert(index >= 0 && index <= 3); + if (index > 1) { + a.vect_s64[1] = vsetq_lane_s64(i, a.vect_s64[1], index & 1); + } else { + a.vect_s64[0] = vsetq_lane_s64(i, a.vect_s64[0], index); + } + return a; +} + +FORCE_INLINE __m256i _mm256_cmpeq_epi32 (__m256i a, __m256i b) +{ + __m256i result_m256i; + result_m256i.vect_u32[0] = vceqq_s32(a.vect_s32[0], b.vect_s32[0]); + result_m256i.vect_u32[1] = vceqq_s32(a.vect_s32[1], b.vect_s32[1]); + return result_m256i; +} + +FORCE_INLINE __m256i _mm256_cmpeq_epi8(__m256i a, __m256i b) +{ + __m256i result_m256i; + result_m256i.vect_u8[0] = vceqq_s8(a.vect_s8[0], b.vect_s8[0]); + result_m256i.vect_u8[1] = vceqq_s8(a.vect_s8[1], b.vect_s8[1]); + return result_m256i; +} + +typedef uint64x2_t (*TYPE_FUNC_CMP_PD)(__m128d a, __m128d b); +typedef struct { + int opDef; + TYPE_FUNC_CMP_PD cmpFun; +} FuncListCmp256Pd; + +static uint64x2_t _cmp_eq_oq(__m128d a, __m128d b) +{ /* Equal (ordered, non-signaling) */ + return vceqq_f64(a, b); +} + +static uint64x2_t _cmp_lt_os(__m128d a, __m128d b) +{ /* Less-than (ordered, signaling) */ + return vcltq_f64(a, b); +} + +static uint64x2_t _cmp_le_os(__m128d a, __m128d b) +{ /* Less-than-or-equal (ordered, signaling) */ + return vcleq_f64(a, b); +} + +static uint64x2_t _cmp_unord_q(__m128d a, __m128d b) +{ /* Unordered (non-signaling) */ + + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + ptr_r[0] = isunordered(ptr_a[0], ptr_b[0]) ? -1 : 0; + ptr_r[1] = isunordered(ptr_a[1], ptr_b[1]) ? -1 : 0; + return res_m128i; +} + +static uint64x2_t _cmp_neq_uq(__m128d a, __m128d b) +{ /* Not-equal (unordered, non-signaling) */ + __m128i res; + res.vect_u64 = vceqq_f64(a, b); + res.vect_u32 = vmvnq_u32(res.vect_u32); + return res.vect_u64; +} + +static uint64x2_t _cmp_nlt_us(__m128d a, __m128d b) +{ /* Not-less-than (unordered, signaling) */ + + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vcgeq_f64(a, b); + + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_nle_us(__m128d a, __m128d b) +{ /* Not-less-than-or-equal (unordered, signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vcgtq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_ord_q(__m128d a, __m128d b) +{ /* Ordered (nonsignaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + ptr_r[0] = !isunordered(ptr_a[0], ptr_b[0]) ? -1 : 0; + ptr_r[1] = !isunordered(ptr_a[1], ptr_b[1]) ? 
-1 : 0; + return res_m128i; +} + +static uint64x2_t _cmp_eq_uq(__m128d a, __m128d b) +{ /* Equal (unordered, non-signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vceqq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_nge_us(__m128d a, __m128d b) +{ /* Not-greater-than-or-equal (unordered, signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vcltq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_ngt_us(__m128d a, __m128d b) +{ /* Not-greater-than (unordered, signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vcleq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_false_oq(__m128d a, __m128d b) +{ /* False (ordered, non-signaling) */ + (void)a; + (void)b; + + return vdupq_n_u64(0); +} + +static uint64x2_t _cmp_neq_oq(__m128d a, __m128d b) +{ /* Not-equal (ordered, non-signaling) */ + __m128i res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i.vect_u64 = vceqq_f64(a, b); + res_m128i.vect_u32 = vmvnq_u32(res_m128i.vect_u32); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = 0; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = 0; + } + return res_m128i.vect_u64; +} + +static uint64x2_t _cmp_ge_os(__m128d a, __m128d b) +{ /* Greater-than-or-equal (ordered, signaling) */ + return vcgeq_f64(a, b); +} + +static uint64x2_t _cmp_gt_os(__m128d a, __m128d b) +{ /* Greater-than (ordered, signaling) */ + return vcgtq_f64(a, b); +} + +static uint64x2_t _cmp_true_uq(__m128d a, __m128d b) +{ /* True (unordered, non-signaling) */ + (void)a; + (void)b; + + return vdupq_n_u64(-1); +} + +static uint64x2_t _cmp_eq_os(__m128d a, __m128d b) +{ /* Equal (ordered, signaling) */ + return vceqq_f64(a, b); +} + +static uint64x2_t _cmp_lt_oq(__m128d a, __m128d b) +{ /* Less-than (ordered, non-signaling) */ + return vcltq_f64(a, b); +} + +static uint64x2_t _cmp_le_oq(__m128d a, __m128d b) +{ /* Less-than-or-equal (ordered, non-signaling) */ + return vcleq_f64(a, b); +} + +static uint64x2_t _cmp_unord_s(__m128d a, __m128d b) +{ /* Unordered (signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + ptr_r[0] = isunordered(ptr_a[0], ptr_b[0]) ? -1 : 0; + ptr_r[1] = isunordered(ptr_a[1], ptr_b[1]) ? 
-1 : 0; + return res_m128i; +} + +static uint64x2_t _cmp_neq_us(__m128d a, __m128d b) +{ /* Not-equal (unordered, signaling) */ + __m128i res_m128i; + res_m128i.vect_u64 = vceqq_f64(a, b); + res_m128i.vect_u32 = vmvnq_u32(res_m128i.vect_u32); + return res_m128i.vect_u64; +} + +static uint64x2_t _cmp_nlt_uq(__m128d a, __m128d b) +{ /* Not-less-than (unordered, non-signaling)*/ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vcgeq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_nle_uq(__m128d a, __m128d b) +{ /* Not-less-than-or-equal (unordered, non-signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vcgtq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_ord_s(__m128d a, __m128d b) +{ /* Ordered (signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vceqq_f64(a, b); + ptr_r[0] = !isunordered(ptr_a[0], ptr_b[0]) ? -1 : 0; + ptr_r[1] = !isunordered(ptr_a[1], ptr_b[1]) ? -1 : 0; + return res_m128i; +} + +static uint64x2_t _cmp_eq_us(__m128d a, __m128d b) +{ /* Equal (unordered, signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vceqq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_nge_uq(__m128d a, __m128d b) +{ /* Not-greater-than-or-equal (unordered, non-signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vcltq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_ngt_uq(__m128d a, __m128d b) +{ /* Not-greater-than (unordered, non-signaling) */ + uint64x2_t res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i = vcleq_f64(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + return res_m128i; +} + +static uint64x2_t _cmp_false_os(__m128d a, __m128d b) +{ /* False (ordered, signaling) */ + (void)a; + (void)b; + + return vdupq_n_u64(0); +} + +static uint64x2_t _cmp_neq_os(__m128d a, __m128d b) +{ /* Not-equal (ordered, signaling) */ + __m128i res_m128i; + float64_t *ptr_a = (float64_t *)&a; + float64_t *ptr_b = (float64_t *)&b; + uint64_t *ptr_r = (uint64_t *)&res_m128i; + + res_m128i.vect_u64 = vceqq_f64(a, b); + res_m128i.vect_u32 = vmvnq_u32(res_m128i.vect_u32); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = 0; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = 0; + } + return res_m128i.vect_u64; +} + +static uint64x2_t _cmp_ge_oq(__m128d a, __m128d b) +{ /* Greater-than-or-equal (ordered, 
non-signaling) */ + return vcgeq_f64(a, b); +} + +static uint64x2_t _cmp_gt_oq(__m128d a, __m128d b) +{ /* Greater-than (ordered, non-signaling) */ + return vcgtq_f64(a, b); +} + +static uint64x2_t _cmp_true_us(__m128d a, __m128d b) +{ /* True (unordered, signaling) */ + (void)a; + (void)b; + + return vdupq_n_u64(-1); +} + +static FuncListCmp256Pd g_FunListCmp256Pd[] = { + {_CMP_EQ_OQ, _cmp_eq_oq}, {_CMP_LT_OS, _cmp_lt_os}, {_CMP_LE_OS, _cmp_le_os}, {_CMP_UNORD_Q, _cmp_unord_q}, + {_CMP_NEQ_UQ, _cmp_neq_uq}, {_CMP_NLT_US, _cmp_nlt_us}, {_CMP_NLE_US, _cmp_nle_us}, {_CMP_ORD_Q, _cmp_ord_q}, + {_CMP_EQ_UQ, _cmp_eq_uq}, {_CMP_NGE_US, _cmp_nge_us}, {_CMP_NGT_US, _cmp_ngt_us}, {_CMP_FALSE_OQ, _cmp_false_oq}, + {_CMP_NEQ_OQ, _cmp_neq_oq}, {_CMP_GE_OS, _cmp_ge_os}, {_CMP_GT_OS, _cmp_gt_os}, {_CMP_TRUE_UQ, _cmp_true_uq}, + {_CMP_EQ_OS, _cmp_eq_os}, {_CMP_LT_OQ, _cmp_lt_oq}, {_CMP_LE_OQ, _cmp_le_oq}, {_CMP_UNORD_S, _cmp_unord_s}, + {_CMP_NEQ_US, _cmp_neq_us}, {_CMP_NLT_UQ, _cmp_nlt_uq}, {_CMP_NLE_UQ, _cmp_nle_uq}, {_CMP_ORD_S, _cmp_ord_s}, + {_CMP_EQ_US, _cmp_eq_us}, {_CMP_NGE_UQ, _cmp_nge_uq}, {_CMP_NGT_UQ, _cmp_ngt_uq}, {_CMP_FALSE_OS, _cmp_false_os}, + {_CMP_NEQ_OS, _cmp_neq_os}, {_CMP_GE_OQ, _cmp_ge_oq}, {_CMP_GT_OQ, _cmp_gt_oq}, {_CMP_TRUE_US, _cmp_true_us}}; + +FORCE_INLINE __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int imm8) +{ + assert(imm8 < 32 && imm8 >= 0); + __m256d dst; + dst.vect_f64[0] = (float64x2_t)g_FunListCmp256Pd[imm8].cmpFun(a.vect_f64[0], b.vect_f64[0]); + dst.vect_f64[1] = (float64x2_t)g_FunListCmp256Pd[imm8].cmpFun(a.vect_f64[1], b.vect_f64[1]); + return dst; +} + +typedef uint32x4_t (*TYPE_FUNC_CMP_PS)(__m128 a, __m128 b); +typedef struct { + int opDef; + TYPE_FUNC_CMP_PS cmpFun; +} FuncListCmp256Ps; + +static uint32x4_t _cmp_eq_oq_ps(__m128 a, __m128 b) +{ /* Equal (ordered, non-signaling) */ + return vceqq_f32(a, b); +} + +static uint32x4_t _cmp_lt_os_ps(__m128 a, __m128 b) +{ /* Less-than (ordered, signaling) */ + return vcltq_f32(a, b); +} + +static uint32x4_t _cmp_le_os_ps(__m128 a, __m128 b) +{ /* Less-than-or-equal (ordered, signaling) */ + return vcleq_f32(a, b); +} + +static uint32x4_t _cmp_unord_q_ps(__m128 a, __m128 b) +{ /* Unordered (non-signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + ptr_r[0] = isunordered(ptr_a[0], ptr_b[0]) ? -1 : 0; + ptr_r[1] = isunordered(ptr_a[1], ptr_b[1]) ? -1 : 0; + ptr_r[2] = isunordered(ptr_a[2], ptr_b[2]) ? -1 : 0; + ptr_r[3] = isunordered(ptr_a[3], ptr_b[3]) ? 
-1 : 0; + return res; +} + +static uint32x4_t _cmp_neq_uq_ps(__m128 a, __m128 b) +{ /* Not-equal (unordered, non-signaling) */ + uint32x4_t res = vceqq_f32(a, b); + return vmvnq_u32(res); +} + +static uint32x4_t _cmp_nlt_us_ps(__m128 a, __m128 b) +{ /* Not-less-than (unordered, signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vcgeq_f32(a, b); + + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_nle_us_ps(__m128 a, __m128 b) +{ /* Not-less-than-or-equal (unordered, signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vcgtq_f32(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_ord_q_ps(__m128 a, __m128 b) +{ /* Ordered (nonsignaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + ptr_r[0] = !isunordered(ptr_a[0], ptr_b[0]) ? -1 : 0; + ptr_r[1] = !isunordered(ptr_a[1], ptr_b[1]) ? -1 : 0; + ptr_r[2] = !isunordered(ptr_a[2], ptr_b[2]) ? -1 : 0; + ptr_r[3] = !isunordered(ptr_a[3], ptr_b[3]) ? -1 : 0; + return res; +} + +static uint32x4_t _cmp_eq_uq_ps(__m128 a, __m128 b) +{ /* Equal (unordered, non-signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vceqq_f32(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_nge_us_ps(__m128 a, __m128 b) +{ /* Not-greater-than-or-equal (unordered, signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vcltq_f32(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_ngt_us_ps(__m128 a, __m128 b) +{ /* Not-greater-than (unordered, signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vcleq_f32(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_false_oq_ps(__m128 a, __m128 b) +{ /* False (ordered, non-signaling) */ + (void)a; + (void)b; + return vdupq_n_u32(0); +} + +static uint32x4_t _cmp_neq_oq_ps(__m128 a, __m128 b) +{ /* Not-equal (ordered, non-signaling) */ + 
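/* NEON has no ordered not-equal predicate: invert EQ, then clear any lane
+       whose operands compare unordered, since ordered predicates must be
+       false when either input is NaN. */
+    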
uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vceqq_f32(a, b); + res = vmvnq_u32(res); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = 0; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = 0; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = 0; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = 0; + } + return res; +} + +static uint32x4_t _cmp_ge_os_ps(__m128 a, __m128 b) +{ /* Greater-than-or-equal (ordered, signaling) */ + return vcgeq_f32(a, b); +} + +static uint32x4_t _cmp_gt_os_ps(__m128 a, __m128 b) +{ /* Greater-than (ordered, signaling) */ + return vcgtq_f32(a, b); +} + +static uint32x4_t _cmp_true_uq_ps(__m128 a, __m128 b) +{ /* True (unordered, non-signaling) */ + (void)a; + (void)b; + return vdupq_n_u32(-1); +} + +static uint32x4_t _cmp_eq_os_ps(__m128 a, __m128 b) +{ /* Equal (ordered, signaling) */ + return vceqq_f32(a, b); +} + +static uint32x4_t _cmp_lt_oq_ps(__m128 a, __m128 b) +{ /* Less-than (ordered, non-signaling) */ + return vcltq_f32(a, b); +} + +static uint32x4_t _cmp_le_oq_ps(__m128 a, __m128 b) +{ /* Less-than-or-equal (ordered, non-signaling) */ + return vcleq_f32(a, b); +} + +static uint32x4_t _cmp_unord_s_ps(__m128 a, __m128 b) +{ /* Unordered (signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + ptr_r[0] = isunordered(ptr_a[0], ptr_b[0]) ? -1 : 0; + ptr_r[1] = isunordered(ptr_a[1], ptr_b[1]) ? -1 : 0; + ptr_r[2] = isunordered(ptr_a[2], ptr_b[2]) ? -1 : 0; + ptr_r[3] = isunordered(ptr_a[3], ptr_b[3]) ? -1 : 0; + return res; +} + +static uint32x4_t _cmp_neq_us_ps(__m128 a, __m128 b) +{ /* Not-equal (unordered, signaling) */ + uint32x4_t res = vceqq_f32(a, b); + return vmvnq_u32(res); +} + +static uint32x4_t _cmp_nlt_uq_ps(__m128 a, __m128 b) +{ /* Not-less-than (unordered, non-signaling)*/ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vcgeq_f32(a, b); + + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_nle_uq_ps(__m128 a, __m128 b) +{ /* Not-less-than-or-equal (unordered, non-signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vcgtq_f32(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_ord_s_ps(__m128 a, __m128 b) +{ /* Ordered (signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + ptr_r[0] = !isunordered(ptr_a[0], ptr_b[0]) ? -1 : 0; + ptr_r[1] = !isunordered(ptr_a[1], ptr_b[1]) ? -1 : 0; + ptr_r[2] = !isunordered(ptr_a[2], ptr_b[2]) ? -1 : 0; + ptr_r[3] = !isunordered(ptr_a[3], ptr_b[3]) ? 
-1 : 0; + return res; +} + +static uint32x4_t _cmp_eq_us_ps(__m128 a, __m128 b) +{ /* Equal (unordered, signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vceqq_f32(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_nge_uq_ps(__m128 a, __m128 b) +{ /* Not-greater-than-or-equal (unordered, non-signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vcltq_f32(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_ngt_uq_ps(__m128 a, __m128 b) +{ /* Not-greater-than (unordered, non-signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vcleq_f32(a, b); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = -1; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = -1; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = -1; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = -1; + } + return res; +} + +static uint32x4_t _cmp_false_os_ps(__m128 a, __m128 b) +{ /* False (ordered, signaling) */ + (void)a; + (void)b; + return vdupq_n_u32(0); +} + +static uint32x4_t _cmp_neq_os_ps(__m128 a, __m128 b) +{ /* Not-equal (ordered, signaling) */ + uint32x4_t res; + float32_t *ptr_a = (float32_t *)&a; + float32_t *ptr_b = (float32_t *)&b; + uint32_t *ptr_r = (uint32_t *)&res; + + res = vceqq_f32(a, b); + res = vmvnq_u32(res); + if (isunordered(ptr_a[0], ptr_b[0])) { + ptr_r[0] = 0; + } + if (isunordered(ptr_a[1], ptr_b[1])) { + ptr_r[1] = 0; + } + if (isunordered(ptr_a[2], ptr_b[2])) { + ptr_r[2] = 0; + } + if (isunordered(ptr_a[3], ptr_b[3])) { + ptr_r[3] = 0; + } + return res; +} + +static uint32x4_t _cmp_ge_oq_ps(__m128 a, __m128 b) +{ /* Greater-than-or-equal (ordered, non-signaling) */ + return vcgeq_f32(a, b); +} + +static uint32x4_t _cmp_gt_oq_ps(__m128 a, __m128 b) +{ /* Greater-than (ordered, non-signaling) */ + return vcgtq_f32(a, b); +} + +static uint32x4_t _cmp_true_us_ps(__m128 a, __m128 b) +{ /* True (unordered, signaling) */ + (void)a; + (void)b; + return vdupq_n_u32(-1); +} + +static FuncListCmp256Ps g_FunListCmp256Ps[] = { + {_CMP_EQ_OQ, _cmp_eq_oq_ps}, {_CMP_LT_OS, _cmp_lt_os_ps}, {_CMP_LE_OS, _cmp_le_os_ps}, + {_CMP_UNORD_Q, _cmp_unord_q_ps}, {_CMP_NEQ_UQ, _cmp_neq_uq_ps}, {_CMP_NLT_US, _cmp_nlt_us_ps}, + {_CMP_NLE_US, _cmp_nle_us_ps}, {_CMP_ORD_Q, _cmp_ord_q_ps}, {_CMP_EQ_UQ, _cmp_eq_uq_ps}, + {_CMP_NGE_US, _cmp_nge_us_ps}, {_CMP_NGT_US, _cmp_ngt_us_ps}, {_CMP_FALSE_OQ, _cmp_false_oq_ps}, + {_CMP_NEQ_OQ, _cmp_neq_oq_ps}, {_CMP_GE_OS, _cmp_ge_os_ps}, {_CMP_GT_OS, _cmp_gt_os_ps}, + {_CMP_TRUE_UQ, _cmp_true_uq_ps}, {_CMP_EQ_OS, _cmp_eq_os_ps}, {_CMP_LT_OQ, _cmp_lt_oq_ps}, + {_CMP_LE_OQ, _cmp_le_oq_ps}, {_CMP_UNORD_S, _cmp_unord_s_ps}, {_CMP_NEQ_US, _cmp_neq_us_ps}, + {_CMP_NLT_UQ, _cmp_nlt_uq_ps}, {_CMP_NLE_UQ, _cmp_nle_uq_ps}, {_CMP_ORD_S, _cmp_ord_s_ps}, + {_CMP_EQ_US, _cmp_eq_us_ps}, 
{_CMP_NGE_UQ, _cmp_nge_uq_ps}, {_CMP_NGT_UQ, _cmp_ngt_uq_ps},
+    {_CMP_FALSE_OS, _cmp_false_os_ps}, {_CMP_NEQ_OS, _cmp_neq_os_ps}, {_CMP_GE_OQ, _cmp_ge_oq_ps},
+    {_CMP_GT_OQ, _cmp_gt_oq_ps}, {_CMP_TRUE_US, _cmp_true_us_ps}};
+
+FORCE_INLINE __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int imm8)
+{
+    assert(imm8 < 32 && imm8 >= 0);
+    __m256 dst;
+    dst.vect_f32[0] = vreinterpretq_f32_u32(g_FunListCmp256Ps[imm8].cmpFun(a.vect_f32[0], b.vect_f32[0]));
+    dst.vect_f32[1] = vreinterpretq_f32_u32(g_FunListCmp256Ps[imm8].cmpFun(a.vect_f32[1], b.vect_f32[1]));
+    return dst;
+}
\ No newline at end of file
diff --git a/data/avx2neontestdata.h b/data/avx2neontestdata.h
new file mode 100644
index 0000000..bb4dbee
--- /dev/null
+++ b/data/avx2neontestdata.h
@@ -0,0 +1,2935 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
+ * Description: avx2neon test data file
+ * Author: guotaowei
+ * Create: 2019-11-20
+ */
+
+#ifndef AVX2NEONTESTDATA_H
+#define AVX2NEONTESTDATA_H
+
+#include <stdint.h>
+
+// One struct defines the test data model for one intrinsic.
+// Each intrinsic gets one set of test data, used by the test demo.
+// `expect` is the intrinsic's expected return value; the remaining members are its input arguments.
+
+typedef struct {
+    int8_t a[32];
+    int8_t b[32];
+    int8_t expect[32];
+} test_mm256_div_epi8_data_model;
+static test_mm256_div_epi8_data_model g_test_mm256_div_epi8_data = {
+    {-2, 30, 127, 100, 4, 8, 10, -43, -56, 102, 120, 70, 45, -12, 20, 27,
+     1, -4, 18, 50, -49, 7, 0, 80, 8, 6, -7, 15, 0, 9, 11, 6},
+    {1, 12, -5, 25, 2, 10, -1, -12, -6, 102, 12, 100, -45, 2, 80, 127,
+     11, -42, 8, 5, -7, 17, 20, 10, 9, 2, 12, 3, 1, 3, 4, 2},
+    {-2, 2, -25, 4, 2, 0, -10, 3, 9, 1, 10, 0, -1, -6, 0, 0, 0, 0, 2, 10, 7, 0, 0, 8, 0, 3, 0, 5, 0, 3, 2, 3}};
+
+typedef struct {
+    int16_t a[16];
+    int16_t b[16];
+    int16_t expect[16];
+} test_mm256_div_epi16_data_model;
+static test_mm256_div_epi16_data_model g_test_mm256_div_epi16_data = {
+    {-2, 30, 127, 100, 4, 8, 10, -43, -56, 102, 120, 70, 45, -12, 20, 27},
+    {1, 12, -5, 25, 2, 10, -1, -12, -6, 102, 12, 11, -45, 2, 80, 127},
+    {-2, 2, -25, 4, 2, 0, -10, 3, 9, 1, 10, 6, -1, -6, 0, 0}};
+
+typedef struct {
+    int32_t a[8];
+    int32_t b[8];
+    int32_t expect[8];
+} test_mm256_div_epi32_data_model;
+static test_mm256_div_epi32_data_model g_test_mm256_div_epi32_data = {
+    {12, 30, 127, 100, 4, 8, 10, -43}, {-4, 12, -5, 25, 2, 10, -1, -12}, {-3, 2, -25, 4, 2, 0, -10, 3}};
+
+typedef struct {
+    int64_t a[4];
+    int64_t b[4];
+    int64_t expect[4];
+} test_mm256_div_epi64_data_model;
+static test_mm256_div_epi64_data_model g_test_mm256_div_epi64_data = {
+    {-2, 30, 127, 100}, {1, 12, -5, 25}, {-2, 2, -25, 4}};
+
+typedef struct {
+    uint8_t a[32];
+    uint8_t b[32];
+    uint8_t expect[32];
+} test_mm256_div_epu8_data_model;
+static test_mm256_div_epu8_data_model g_test_mm256_div_epu8_data = {
+    {2, 30, 127, 100, 4, 8, 10, 43, 56, 102, 120, 70, 45, 12, 20, 27,
+     1, 4, 18, 50, 49, 7, 0, 80, 8, 6, 7, 15, 0, 9, 11, 6},
+    {1, 12, 5, 25, 2, 10, 1, 12, 6, 102, 12, 20, 45, 2, 80, 127, 11, 42, 8, 5, 7, 17, 20, 10, 9, 2, 12, 3, 1, 3, 4, 2},
+    {2, 2, 25, 4, 2, 0, 10, 3, 9, 1, 10, 3, 1, 6, 0, 0, 0, 0, 2, 10, 7, 0, 0, 8, 0, 3, 0, 5, 0, 3, 2, 3}};
+
+typedef struct {
+    uint16_t a[16];
+    uint16_t b[16];
+    uint16_t expect[16];
+} test_mm256_div_epu16_data_model;
+static test_mm256_div_epu16_data_model g_test_mm256_div_epu16_data = {
+    {2, 30, 127, 100, 4, 8, 10, 43, 56, 102, 120, 70, 45, 12, 20, 27},
+    {1, 12, 5, 25, 2, 10, 1, 12, 6, 102, 12, 100, 45, 2, 80, 127},
+    {2, 2, 25, 4, 2, 0, 10, 3, 9, 1, 10, 0, 1, 6, 0, 0}};
+
+typedef struct {
+    uint32_t a[8];
+    uint32_t b[8];
+    uint32_t expect[8];
+} 
test_mm256_div_epu32_data_model; +static test_mm256_div_epu32_data_model g_test_mm256_div_epu32_data = { + {2, 30, 127, 100, 4, 8, 10, 43}, {1, 12, 5, 25, 2, 10, 1, 12}, {2, 2, 25, 4, 2, 0, 10, 3}}; + +typedef struct { + uint64_t a[4]; + uint64_t b[4]; + uint64_t expect[4]; +} test_mm256_div_epu64_data_model; +static test_mm256_div_epu64_data_model g_test_mm256_div_epu64_data = {{2, 30, 127, 100}, {1, 12, 5, 25}, {2, 2, 25, 4}}; + +typedef struct { + float32_t a[8]; + float32_t b[8]; + float32_t expect[8]; +} test_mm256_div_ps_data_model; +static test_mm256_div_ps_data_model g_test_mm256_div_ps_data = { + {2.4, 3.8, -5.0, 100, -20.4, 74.6, -50.8, 10.0}, + {1.2, -2.0, 1.0, -102, 10.1, 12.4, -32.0, 9.8}, + {2.000000, -1.900000, -5.000000, -0.980392, -2.019802, 6.016129, 1.587500, 1.020408}}; + +typedef struct { + float64_t a[4]; + float64_t b[4]; + float64_t expect[4]; +} test_mm256_div_pd_data_model; +static test_mm256_div_pd_data_model g_test_mm256_div_pd_data = { + {2.4, -3.8, 5.0, -10.0}, {1.2, 2.2, -1.0, -9.8}, {2.00, -1.727272727, -5.00, 1.0204081632}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + int8_t expect[64]; +} test_mm512_div_epi8_data_model; +static test_mm512_div_epi8_data_model g_test_mm512_div_epi8_data = { + {47, 25, -22, -58, -99, -61, 78, -119, -15, 94, 44, -109, -49, 19, -102, -13, + 23, 77, -111, 83, -118, -16, -100, -89, 66, -43, 58, 47, 9, 47, 41, 42, + -122, -25, -26, -95, -72, -47, -92, 119, -115, 85, 6, 65, -89, -43, 92, -100, + -59, 96, 74, -117, 65, -28, 28, 91, -100, -100, 59, -71, -5, 126, 110, -22}, + {19, 101, -53, -1, 29, -94, -14, 2, 84, -88, 100, -72, 96, 87, 77, 112, 68, 1, 91, 53, 43, -95, + 81, -70, 47, -31, -82, 14, 107, -35, -123, -104, -74, 6, 50, -84, -83, -14, 36, -33, -61, 47, 46, -46, + -66, -83, -71, -68, -99, 41, 33, -24, -4, -20, 65, -66, 87, -118, -41, 20, -52, -113, 53, -10}, + {2, 0, 0, 58, -3, 0, -5, -59, 0, -1, 0, 1, 0, 0, -1, 0, 0, 77, -1, 1, -2, 0, + -1, 1, 1, 1, 0, 3, 0, -1, 0, 0, 1, -4, 0, 1, 0, 3, -2, -3, 1, 1, 0, -1, + 1, 0, -1, 1, 0, 2, 2, 4, -16, 1, 0, -1, -1, 0, -1, -3, 0, -1, 2, 2}}; + +typedef struct { + int16_t a[32]; + int16_t b[32]; + int16_t expect[32]; +} test_mm512_div_epi16_data_model; +static test_mm512_div_epi16_data_model g_test_mm512_div_epi16_data = { + {-27764, 18989, 14860, -1117, 23487, -31492, -20299, -22863, -9384, -12704, -17101, + 22888, -1328, 27043, -30853, -22633, 7232, -14790, 8533, 9370, 7752, -31239, + -19337, 3071, -19607, 4886, -9456, -14056, 27609, 12235, -31911, 27806}, + {-32554, 3007, -16040, -26156, 21074, -28633, -3452, 9807, 14548, 26920, -8623, + -29090, 31491, 14012, -31568, 16812, 20640, 6621, -14516, -5658, -20876, 29586, + 28550, -3293, -29498, -5530, -7081, -3644, 23668, -24271, 23964, 10621}, + {0, 6, 0, 0, 1, 1, 5, -2, 0, 0, 1, 0, 0, 1, 0, -1, 0, -2, 0, -1, 0, -1, 0, 0, 0, 0, 1, 3, 1, 0, -1, 2}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_div_epi32_data_model; +static test_mm512_div_epi32_data_model g_test_mm512_div_epi32_data = { + { + -86526927, + -1363665926, + 1901063859, + -696994376, + 1151417131, + -1397391427, + -522987637, + -859938203, + -1198042314, + 1176091325, + -678966908, + -1847149302, + -1326453694, + -1650350271, + 1339153969, + 1389347723, + }, + {465704244, 1712656224, 1849036094, 867915268, -1757225367, -347662472, -1552853338, -1121834203, -23513990, + -1632940468, 378479997, -415921536, -1390144286, 2115890216, -732968474, 678529282}, + {0, 0, 1, 0, 0, 4, 0, 0, 50, 0, -1, 4, 0, 0, -1, 2}}; + +typedef struct 
{ + int64_t a[8]; + int64_t b[8]; + int64_t expect[8]; +} test_mm512_div_epi64_data_model; +static test_mm512_div_epi64_data_model g_test_mm512_div_epi64_data = { + {-2, 30, 127, 100, 90, 33, 7, 12}, {1, 12, -5, 25, 10, 11, -2, 6}, {-2, 2, -25, 4, 9, 3, -3, 2}}; + +typedef struct { + uint8_t a[64]; + uint8_t b[64]; + uint8_t expect[64]; +} test_mm512_div_epu8_data_model; +static test_mm512_div_epu8_data_model g_test_mm512_div_epu8_data = { + {55, 74, 185, 187, 43, 229, 136, 86, 31, 175, 148, 70, 120, 66, 122, 52, 205, 54, 88, 205, 200, 7, + 212, 103, 125, 176, 70, 154, 41, 23, 16, 252, 136, 94, 145, 66, 59, 204, 41, 138, 220, 233, 33, 148, + 205, 221, 159, 26, 214, 88, 217, 160, 28, 122, 50, 103, 11, 125, 245, 84, 61, 43, 234, 254}, + {95, 201, 161, 7, 134, 20, 214, 181, 246, 206, 117, 16, 240, 222, 197, 42, 65, 159, 63, 111, 182, 229, + 154, 149, 179, 108, 80, 17, 167, 5, 141, 178, 138, 62, 212, 16, 179, 103, 176, 232, 169, 23, 80, 176, + 189, 164, 237, 216, 140, 53, 65, 160, 197, 72, 121, 105, 194, 244, 118, 241, 219, 47, 48, 216}, + {0, 0, 1, 26, 0, 11, 0, 0, 0, 0, 1, 4, 0, 0, 0, 1, 3, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 9, 0, 4, 0, 1, + 0, 1, 0, 4, 0, 1, 0, 0, 1, 10, 0, 0, 1, 1, 0, 0, 1, 1, 3, 1, 0, 1, 0, 0, 0, 0, 2, 0, 0, 0, 4, 1}}; + +typedef struct { + uint16_t a[32]; + uint16_t b[32]; + uint16_t expect[32]; +} test_mm512_div_epu16_data_model; +static test_mm512_div_epu16_data_model g_test_mm512_div_epu16_data = { + {6093, 42861, 16601, 30643, 658, 60923, 18131, 31926, 25828, 6984, 42686, 2740, 32879, 49179, 17320, 7494, + 27197, 2932, 11855, 48682, 55996, 601, 39589, 28778, 14142, 59953, 38450, 6003, 39710, 55349, 25634, 58112}, + {34515, 52323, 9067, 33044, 63197, 2487, 42557, 35014, 47494, 48556, 1715, 27920, 39813, 63644, 5287, 61489, + 50722, 33691, 29654, 40504, 20180, 64930, 36671, 20106, 13733, 42870, 2218, 60682, 964, 3909, 986, 42920}, + {0, 0, 1, 0, 0, 24, 0, 0, 0, 0, 24, 0, 0, 0, 3, 0, 0, 0, 0, 1, 2, 0, 1, 1, 1, 1, 17, 0, 41, 14, 25, 1}}; + +typedef struct { + uint32_t a[16]; + uint32_t b[16]; + uint32_t expect[16]; +} test_mm512_div_epu32_data_model; +static test_mm512_div_epu32_data_model g_test_mm512_div_epu32_data = { + {133695101, 2255378798, 1666553170, 2288411503, 3229563082, 84913098, 1798373403, 90246357, 2256724188, 253413769, + 1961992073, 3517283845, 3832301571, 682532875, 2758293005, 3097792386}, + {3402735806, 1326242498, 3914365261, 4170459018, 3667759151, 1272899006, 1635724039, 159383538, 3790403195, + 2352788476, 2513199794, 2514630521, 892080090, 2927324231, 3167767204, 1540924681}, + {0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 4, 0, 0, 2}}; + +typedef struct { + uint64_t a[8]; + uint64_t b[8]; + uint64_t expect[8]; +} test_mm512_div_epu64_data_model; +static test_mm512_div_epu64_data_model g_test_mm512_div_epu64_data = { + {2, 30, 127, 100, 99, 34, 12, 10}, {1, 12, 5, 25, 9, 11, 2, 5}, {2, 2, 25, 4, 11, 3, 6, 2}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_div_ps_data_model; +static test_mm512_div_ps_data_model g_test_mm512_div_ps_data = { + {2.4, 3.8, -5.0, 100, -20.4, 74.6, -50.8, 10.0, 40.658, 14.65, 49, 120, 1.5, 0.0024, 0.3, -4.2}, + {1.2, -2.0, 1.0, -102, 10.1, 12.4, -32.0, 9.8, 3, 0.2, -0.5, 7.4, 30, 0.005, -4, 2.1}, + {2.0, -1.9, -5.0, -0.98039, -2.01980, 6.01613, 1.58750, 1.02041, 13.55267, 73.25, -98, 16.21622, 0.05, 0.48, -0.075, + -2}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_div_pd_data_model; +static test_mm512_div_pd_data_model 
g_test_mm512_div_pd_data = { + {2.4, -3.8, 5.0, -10.0, -20.4, 74.6, -50.8, 10.0}, + {1.2, 2.2, -1.0, -9.8, 10.1, 12.4, -32.0, 9.8}, + {2.00, -1.727272727, -5.00, 1.0204081632, -2.01980198, 6.01612903, 1.587500, 1.02040816}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + int rounding; + float32_t expect[16]; +} test_mm512_div_round_ps_data_model; +static test_mm512_div_round_ps_data_model g_test_mm512_div_round_ps_data = { + {2.4, 3.8, -5.0, 100, -20.4, 74.6, -50.8, 10.0, 40.658, 14.65, 49, 120, 1.5, 0.0024, 0.3, -4.2}, + {1.2, -2.0, 1.0, -102, 10.1, 12.4, -32.0, 9.8, 3, 0.2, -0.5, 7.4, 30, 0.005, -4, 2.1}, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC, + {2.0, -1.9, -5.0, -0.98039, -2.01980, 6.01613, 1.58750, 1.02041, 13.55267, 73.25, -98, 16.21622, 0.05, 0.48, -0.075, + -2}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + int rounding; + float64_t expect[8]; +} test_mm512_div_round_pd_data_model; +static test_mm512_div_round_pd_data_model g_test_mm512_div_round_pd_data = { + {2.4, -3.8, 5.0, -10.0, -20.4, 74.6, -50.8, 10.0}, + {1.2, 2.2, -1.0, -9.8, 10.1, 12.4, -32.0, 9.8}, + _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC, + {2.00, -1.727272727, -5.00, 1.0204081632, -2.01980198, 6.01612903, 1.587500, 1.02040816}}; + + +typedef struct { + int8_t a[32]; + int8_t b[32]; + int8_t expect[32]; +} test_mm256_add_epi8_data_model; +static test_mm256_add_epi8_data_model g_test_mm256_add_epi8_data = { + {107, -45, 58, -11, 46, 84, 2, 91, 38, -92, 6, -5, -33, 14, -96, -19, + 90, 5, 52, 45, 70, -44, 45, 24, -109, 30, 44, 108, -17, 22, -122, 27}, + {-49, -7, 115, -103, 76, 114, -84, -49, 99, -54, 112, -46, -43, 10, 11, -116, + 65, 20, 122, -30, 1, -55, -80, -45, 60, 25, -103, -116, -6, 15, -125, 96}, + {58, -52, -83, -114, 122, -58, -82, 42, -119, 110, 118, -51, -76, 24, -85, 121, + -101, 25, -82, 15, 71, -99, -35, -21, -49, 55, -59, -8, -23, 37, 9, 123}}; + +typedef struct { + int16_t a[16]; + int16_t b[16]; + int16_t expect[16]; +} test_mm256_add_epi16_data_model; +static test_mm256_add_epi16_data_model g_test_mm256_add_epi16_data = { + {-5864, -18716, 7272, 20419, -28335, -3286, -19673, -26955, 13073, 21177, -18843, 22508, 11120, -17786, 18071, + 13963}, + {12271, 2499, 30699, -5904, 22925, 28211, -5074, 14008, 1172, 9077, 7403, 32476, 29133, 8618, 20764, -20560}, + {6407, -16217, -27565, 14515, -5410, 24925, -24747, -12947, 14245, 30254, -11440, -10552, -25283, -9168, -26701, + -6597}}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + int32_t expect[8]; +} test_mm256_add_epi32_data_model; +static test_mm256_add_epi32_data_model g_test_mm256_add_epi32_data = { + {1, 16516, -21313, -545454, -2147483648, 313213, -1695550175, -486101411}, + {1, 321321, -113132, 1321564, 2147483647, -13213211, 25646321, -1277938685}, + {2, 337837, -134445, 776110, -1, -12899998, -1669903854, -1764040096}}; + +typedef struct { + int64_t a[4]; + int64_t b[4]; + int64_t expect[4]; +} test_mm256_add_epi64_data_model; +static test_mm256_add_epi64_data_model g_test_mm256_add_epi64_data = { + {1, 43532132, 9223372036854775806, -9223372036854775807}, + {1, 13213143, 1, 9223372036854775807}, + {2, 56745275, 9223372036854775807, 0}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + int8_t expect[64]; +} test_mm512_add_epi8_data_model; +static test_mm512_add_epi8_data_model g_test_mm512_add_epi8_data = { + {107, -45, 58, -11, 46, 84, 2, 91, 38, -92, 6, -5, -33, 14, -96, -19, 90, 5, 52, 45, 70, -44, + 45, 24, -109, 30, 44, 108, -17, 22, -122, 27, 107, -45, 58, -11, 46, 84, 2, 91, 38, -92, 6, -5, + -33, 
14, -96, -19, 90, 5, 52, 45, 70, -44, 45, 24, -109, 30, 44, 108, -17, 22, -122, 27}, + {-49, -7, 115, -103, 76, 114, -84, -49, 99, -54, 112, -46, -43, 10, 11, -116, + 65, 20, 122, -30, 1, -55, -80, -45, 60, 25, -103, -116, -6, 15, -125, 96, + -49, -7, 115, -103, 76, 114, -84, -49, 99, -54, 112, -46, -43, 10, 11, -116, + 65, 20, 122, -30, 1, -55, -80, -45, 60, 25, -103, -116, -6, 15, -125, 96}, + {58, -52, -83, -114, 122, -58, -82, 42, -119, 110, 118, -51, -76, 24, -85, 121, -101, 25, -82, 15, 71, -99, + -35, -21, -49, 55, -59, -8, -23, 37, 9, 123, 58, -52, -83, -114, 122, -58, -82, 42, -119, 110, 118, -51, + -76, 24, -85, 121, -101, 25, -82, 15, 71, -99, -35, -21, -49, 55, -59, -8, -23, 37, 9, 123}}; + +typedef struct { + int16_t a[32]; + int16_t b[32]; + int16_t expect[32]; +} test_mm512_add_epi16_data_model; +static test_mm512_add_epi16_data_model g_test_mm512_add_epi16_data = { + {-5864, -18716, 7272, 20419, -28335, -3286, -19673, -26955, 13073, 21177, -18843, + 22508, 11120, -17786, 18071, 13963, -5864, -18716, 7272, 20419, -28335, -3286, + -19673, -26955, 13073, 21177, -18843, 22508, 11120, -17786, 18071, 13963}, + {12271, 2499, 30699, -5904, 22925, 28211, -5074, 14008, 1172, 9077, 7403, 32476, 29133, 8618, 20764, -20560, + 12271, 2499, 30699, -5904, 22925, 28211, -5074, 14008, 1172, 9077, 7403, 32476, 29133, 8618, 20764, -20560}, + {6407, -16217, -27565, 14515, -5410, 24925, -24747, -12947, 14245, 30254, -11440, + -10552, -25283, -9168, -26701, -6597, 6407, -16217, -27565, 14515, -5410, 24925, + -24747, -12947, 14245, 30254, -11440, -10552, -25283, -9168, -26701, -6597}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_add_epi32_data_model; +static test_mm512_add_epi32_data_model g_test_mm512_add_epi32_data = { + {1, 16516, -21313, -545454, -2147483648, 313213, -1695550175, -486101411, 1, 16516, -21313, -545454, -2147483648, + 313213, -1695550175, -486101411}, + {1, 321321, -113132, 1321564, 2147483647, -13213211, 25646321, -1277938685, 1, 321321, -113132, 1321564, 2147483647, + -13213211, 25646321, -1277938685}, + {2, 337837, -134445, 776110, -1, -12899998, -1669903854, -1764040096, 2, 337837, -134445, 776110, -1, -12899998, + -1669903854, -1764040096}}; + +typedef struct { + int64_t a[8]; + int64_t b[8]; + int64_t expect[8]; +} test_mm512_add_epi64_data_model; +static test_mm512_add_epi64_data_model g_test_mm512_add_epi64_data = { + {1, 43532132, 9223372036854775806, -9223372036854775807, 1, 43532132, 9223372036854775805, -9223372036854775807}, + {1, 13213143, 1, 9223372036854775807, 1, 13213143, 2, 9223372036854775806}, + {2, 56745275, 9223372036854775807, 0, 2, 56745275, 9223372036854775807, -1}}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + int8_t expect[32]; +} test_mm256_adds_epi8_data_model; +static test_mm256_adds_epi8_data_model g_test_mm256_adds_epi8_data = { + {107, -45, 58, -11, 46, 84, 2, 91, 38, -92, 6, -5, -33, 14, -96, -19, + 90, 5, 52, 45, 70, -44, 45, 24, -109, 30, 44, 108, -17, 22, -122, 27}, + {39, -99, 115, -103, 76, 114, -84, -49, 99, -54, 112, -46, -43, 10, 11, -116, + 65, 20, 122, -30, 1, -55, -80, -45, 60, 25, -103, -116, -6, 15, -125, 96}, + {127, -128, 127, -114, 122, 127, -82, 42, 127, -128, 118, -51, -76, 24, -85, -128, + 127, 25, 127, 15, 71, -99, -35, -21, -49, 55, -59, -8, -23, 37, -128, 123}}; + +typedef struct { + int16_t a[16]; + int16_t b[16]; + int16_t expect[16]; +} test_mm256_adds_epi16_data_model; +static test_mm256_adds_epi16_data_model g_test_mm256_adds_epi16_data = { + {-5864, -18716, 
30111, -31000, -28335, 28333, -19673, -26955, 13073, 21177, -18843, 22508, 11120, -17786, 18071, + 13963}, + {12271, 2499, 30699, -5904, 22925, 28211, -19333, 14008, 1172, 9077, 7403, 32476, 29133, 8618, 20764, -20560}, + {6407, -16217, 32767, -32768, -5410, 32767, -32768, -12947, 14245, 30254, -11440, 32767, 32767, -9168, 32767, + -6597}}; + +typedef struct { + uint8_t a[32]; + uint8_t b[32]; + uint8_t expect[32]; +} test_mm256_adds_epu8_data_model; +static test_mm256_adds_epu8_data_model g_test_mm256_adds_epu8_data = { + {246, 10, 160, 160, 47, 243, 45, 128, 76, 158, 44, 44, 26, 145, 210, 76, + 204, 170, 115, 91, 104, 15, 232, 66, 157, 109, 32, 7, 93, 48, 152, 248}, + {203, 225, 28, 239, 30, 109, 154, 172, 13, 57, 63, 61, 195, 246, 0, 201, + 86, 108, 74, 162, 78, 150, 143, 52, 225, 201, 154, 58, 152, 47, 124, 100}, + {255, 235, 188, 255, 77, 255, 199, 255, 89, 215, 107, 105, 221, 255, 210, 255, + 255, 255, 189, 253, 182, 165, 255, 118, 255, 255, 186, 65, 245, 95, 255, 255}}; + +typedef struct { + uint16_t a[16]; + uint16_t b[16]; + uint16_t expect[16]; +} test_mm256_adds_epu16_data_model; +static test_mm256_adds_epu16_data_model g_test_mm256_adds_epu16_data = { + {2505, 5546, 31194, 28975, 21936, 58967, 32346, 29341, 57603, 29201, 11167, 1633, 24196, 42551, 30133, 61941}, + {19524, 29921, 17306, 22881, 26864, 47618, 26529, 27503, 45461, 14653, 46631, 34091, 23328, 42737, 36807, 32638}, + {22029, 35467, 48500, 51856, 48800, 65535, 58875, 56844, 65535, 43854, 57798, 35724, 47524, 65535, 65535, 65535}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + int8_t expect[64]; +} test_mm512_adds_epi8_data_model; +static test_mm512_adds_epi8_data_model g_test_mm512_adds_epi8_data = { + {107, -45, 58, -11, 46, 84, 2, 91, 38, -92, 6, -5, -33, 14, -96, -19, 90, 5, 52, 45, 70, -44, + 45, 24, -109, 30, 44, 108, -17, 22, -122, 27, 107, -45, 58, -11, 46, 84, 2, 91, 38, -92, 6, -5, + -33, 14, -96, -19, 90, 5, 52, 45, 70, -44, 45, 24, -109, 30, 44, 108, -17, 22, -122, 27}, + {39, -99, 115, -103, 76, 114, -84, -49, 99, -54, 112, -46, -43, 10, 11, -116, + 65, 20, 122, -30, 1, -55, -80, -45, 60, 25, -103, -116, -6, 15, -125, 96, + 39, -99, 115, -103, 76, 114, -84, -49, 99, -54, 112, -46, -43, 10, 11, -116, + 65, 20, 122, -30, 1, -55, -80, -45, 60, 25, -103, -116, -6, 15, -125, 96}, + {127, -128, 127, -114, 122, 127, -82, 42, 127, -128, 118, -51, -76, 24, -85, -128, + 127, 25, 127, 15, 71, -99, -35, -21, -49, 55, -59, -8, -23, 37, -128, 123, + 127, -128, 127, -114, 122, 127, -82, 42, 127, -128, 118, -51, -76, 24, -85, -128, + 127, 25, 127, 15, 71, -99, -35, -21, -49, 55, -59, -8, -23, 37, -128, 123}}; + +typedef struct { + int16_t a[32]; + int16_t b[32]; + int16_t expect[32]; +} test_mm512_adds_epi16_data_model; +static test_mm512_adds_epi16_data_model g_test_mm512_adds_epi16_data = { + {-5864, -18716, 30111, -31000, -28335, 28333, -19673, -26955, 13073, 21177, -18843, + 22508, 11120, -17786, 18071, 13963, -5864, -18716, 30111, -31000, -28335, 28333, + -19673, -26955, 13073, 21177, -18843, 22508, 11120, -17786, 18071, 13963}, + {12271, 2499, 30699, -5904, 22925, 28211, -19333, 14008, 1172, 9077, 7403, 32476, 29133, 8618, 20764, -20560, + 12271, 2499, 30699, -5904, 22925, 28211, -19333, 14008, 1172, 9077, 7403, 32476, 29133, 8618, 20764, -20560}, + {6407, -16217, 32767, -32768, -5410, 32767, -32768, -12947, 14245, 30254, -11440, + 32767, 32767, -9168, 32767, -6597, 6407, -16217, 32767, -32768, -5410, 32767, + -32768, -12947, 14245, 30254, -11440, 32767, 32767, -9168, 32767, -6597}}; + +typedef struct { + 
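/* expect[] holds saturating sums: lanes clamp at 255 rather than wrapping. */
+    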
uint8_t a[64]; + uint8_t b[64]; + uint8_t expect[64]; +} test_mm512_adds_epu8_data_model; +static test_mm512_adds_epu8_data_model g_test_mm512_adds_epu8_data = { + {246, 10, 160, 160, 47, 243, 45, 128, 76, 158, 44, 44, 26, 145, 210, 76, 204, 170, 115, 91, 104, 15, + 232, 66, 157, 109, 32, 7, 93, 48, 152, 248, 211, 209, 236, 232, 123, 235, 134, 152, 14, 237, 159, 105, + 140, 44, 22, 137, 199, 47, 134, 8, 14, 101, 33, 184, 2, 94, 153, 158, 98, 74, 239, 98}, + {203, 225, 28, 239, 30, 109, 154, 172, 13, 57, 63, 61, 195, 246, 0, 201, 86, 108, 74, 162, 78, 150, + 143, 52, 225, 201, 154, 58, 152, 47, 124, 100, 162, 70, 44, 85, 247, 99, 45, 35, 5, 47, 244, 252, + 154, 36, 36, 233, 90, 179, 23, 1, 243, 149, 253, 47, 165, 162, 200, 37, 202, 120, 211, 182}, + {255, 235, 188, 255, 77, 255, 199, 255, 89, 215, 107, 105, 221, 255, 210, 255, 255, 255, 189, 253, 182, 165, + 255, 118, 255, 255, 186, 65, 245, 95, 255, 255, 255, 255, 255, 255, 255, 255, 179, 187, 19, 255, 255, 255, + 255, 80, 58, 255, 255, 226, 157, 9, 255, 250, 255, 231, 167, 255, 255, 195, 255, 194, 255, 255}}; + +typedef struct { + uint16_t a[32]; + uint16_t b[32]; + uint16_t expect[32]; +} test_mm512_adds_epu16_data_model; +static test_mm512_adds_epu16_data_model g_test_mm512_adds_epu16_data = { + {2505, 5546, 31194, 28975, 21936, 58967, 32346, 29341, 57603, 29201, 11167, 1633, 24196, 42551, 30133, 61941, + 56332, 62560, 19258, 44871, 52863, 23277, 64832, 47177, 32544, 6016, 57474, 12266, 30978, 58871, 26089, 28214}, + {19524, 29921, 17306, 22881, 26864, 47618, 26529, 27503, 45461, 14653, 46631, 34091, 23328, 42737, 36807, 32638, + 1951, 21990, 25999, 41194, 38302, 19674, 52618, 56899, 10842, 43711, 7650, 16134, 54818, 61112, 55276, 16885}, + {22029, 35467, 48500, 51856, 48800, 65535, 58875, 56844, 65535, 43854, 57798, 35724, 47524, 65535, 65535, 65535, + 58283, 65535, 45257, 65535, 65535, 42951, 65535, 65535, 43386, 49727, 65124, 28400, 65535, 65535, 65535, 45099}}; + +typedef struct { + float32_t a[8]; + float32_t b[8]; + float32_t expect[8]; +} test_mm256_add_ps_data_model; +static test_mm256_add_ps_data_model g_test_mm256_add_ps_data = { + {55720.105469, 64281.750000, 89742.835938, 73413.921875, 62127.492188, 89540.796875, 91808.859375, 18845.945312}, + {51026.980469, 74192.773438, 31054.048828, 40471.296875, 10455.238281, 76734.781250, 35246.054688, 66186.554688}, + {106747.085938, 138474.531250, 120796.882812, 113885.218750, 72582.734375, 166275.578125, 127054.914062, + 85032.500000}}; + +typedef struct { + float64_t a[4]; + float64_t b[4]; + float64_t expect[4]; +} test_mm256_add_pd_data_model; +static test_mm256_add_pd_data_model g_test_mm256_add_pd_data = { + {895828870.5853160620, 13198299.6345069464, -46353094.4883757681, 1008132723.0642696619}, + {72817530.7555908561, -86737498.6360623688, 463481696.5422678590, 1548820508.3372950554}, + {968646401.3409068584, -73539199.0015554279, 417128602.0538920760, 2556953231.4015645981}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_add_ps_data_model; +static test_mm512_add_ps_data_model g_test_mm512_add_ps_data = { + {55720.105469, 64281.750000, 89742.835938, 73413.921875, 62127.492188, 89540.796875, 91808.859375, 18845.945312, + 35417.636719, 12672.005859, 48391.230469, 99421.414062, 26799.482422, 18733.376953, 87186.093750, 14267.460938}, + {51026.980469, 74192.773438, 31054.048828, 40471.296875, 10455.238281, 76734.781250, 35246.054688, 66186.554688, + 8255.891602, 5566.725098, 77149.703125, 96017.484375, 64436.101562, 87067.187500, 
93427.578125, 42545.218750}, + {106747.085938, 138474.531250, 120796.882812, 113885.218750, 72582.734375, 166275.578125, 127054.914062, + 85032.500000, 43673.527344, 18238.730469, 125540.937500, 195438.906250, 91235.585938, 105800.562500, 180613.671875, + 56812.679688}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_add_pd_data_model; +static test_mm512_add_pd_data_model g_test_mm512_add_pd_data = { + {895828870.5853160620, 13198299.6345069464, -46353094.4883757681, 1008132723.0642696619, 1132533339.7273423672, + 190439947.2382000089, -1511701151.4810729027, 446683041.9176963568}, + {72817530.7555908561, -86737498.6360623688, 463481696.5422678590, 1548820508.3372950554, -139317257.6059397161, + 2251411999.9036965370, 1218114754.9971106052, 27277800.0651864633}, + {968646401.3409068584, -73539199.0015554279, 417128602.0538920760, 2556953231.4015645981, 993216082.1214026213, + 2441851947.1418967247, -293586396.4839622974, 473960841.9828827977}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + int rounding; + float32_t expect[16]; +} test_mm512_add_round_ps_data_model; +static test_mm512_add_round_ps_data_model g_test_mm512_add_round_ps_data = { + {55720.105469, 64281.750000, 89742.835938, 73413.921875, 62127.492188, 89540.796875, 91808.859375, 18845.945312, + 35417.636719, 12672.005859, 48391.230469, 99421.414062, 26799.482422, 18733.376953, 87186.093750, 14267.460938}, + {51026.980469, 74192.773438, 31054.048828, 40471.296875, 10455.238281, 76734.781250, 35246.054688, 66186.554688, + 8255.891602, 5566.725098, 77149.703125, 96017.484375, 64436.101562, 87067.187500, 93427.578125, 42545.218750}, + 10, + {106747.085938, 138474.531250, 120796.882812, 113885.218750, 72582.734375, 166275.578125, 127054.914062, + 85032.500000, 43673.527344, 18238.730469, 125540.937500, 195438.906250, 91235.585938, 105800.562500, 180613.671875, + 56812.679688}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + int rounding; + float64_t expect[8]; +} test_mm512_add_round_pd_data_model; +static test_mm512_add_round_pd_data_model g_test_mm512_add_round_pd_data = { + {895828870.5853160620, 13198299.6345069464, -46353094.4883757681, 1008132723.0642696619, 1132533339.7273423672, + 190439947.2382000089, -1511701151.4810729027, 446683041.9176963568}, + {72817530.7555908561, -86737498.6360623688, 463481696.5422678590, 1548820508.3372950554, -139317257.6059397161, + 2251411999.9036965370, 1218114754.9971106052, 27277800.0651864633}, + 10, + {968646401.3409068584, -73539199.0015554279, 417128602.0538920760, 2556953231.4015645981, 993216082.1214026213, + 2441851947.1418967247, -293586396.4839622974, 473960841.9828827977}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_addn_ps_data_model; +static test_mm512_addn_ps_data_model g_test_mm512_addn_ps_data = { + {55720.105469, 64281.750000, 89742.835938, 73413.921875, 62127.492188, 89540.796875, 91808.859375, 18845.945312, + 35417.636719, 12672.005859, 48391.230469, 99421.414062, 26799.482422, 18733.376953, 87186.093750, 14267.460938}, + {51026.980469, 74192.773438, 31054.048828, 40471.296875, 10455.238281, 76734.781250, 35246.054688, 66186.554688, + 8255.891602, 5566.725098, 77149.703125, 96017.484375, 64436.101562, 87067.187500, 93427.578125, 42545.218750}, + {-106747.085938, -138474.531250, -120796.882812, -113885.218750, -72582.734375, -166275.578125, -127054.914062, + -85032.500000, -43673.527344, -18238.730469, -125540.937500, -195438.906250, -91235.585938, 
-105800.562500, + -180613.671875, -56812.679688}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_addn_pd_data_model; +static test_mm512_addn_pd_data_model g_test_mm512_addn_pd_data = { + {895828870.5853160620, 13198299.6345069464, -46353094.4883757681, 1008132723.0642696619, 1132533339.7273423672, + 190439947.2382000089, -1511701151.4810729027, 446683041.9176963568}, + {72817530.7555908561, -86737498.6360623688, 463481696.5422678590, 1548820508.3372950554, -139317257.6059397161, + 2251411999.9036965370, 1218114754.9971106052, 27277800.0651864633}, + {-968646401.3409068584, 73539199.0015554279, -417128602.0538920760, -2556953231.4015645981, -993216082.1214026213, + -2441851947.1418967247, 293586396.4839622974, -473960841.9828827977}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + int rounding; + float32_t expect[16]; +} test_mm512_addn_round_ps_data_model; +static test_mm512_addn_round_ps_data_model g_test_mm512_addn_round_ps_data = { + {55720.105469, 64281.750000, 89742.835938, 73413.921875, 62127.492188, 89540.796875, 91808.859375, 18845.945312, + 35417.636719, 12672.005859, 48391.230469, 99421.414062, 26799.482422, 18733.376953, 87186.093750, 14267.460938}, + {51026.980469, 74192.773438, 31054.048828, 40471.296875, 10455.238281, 76734.781250, 35246.054688, 66186.554688, + 8255.891602, 5566.725098, 77149.703125, 96017.484375, 64436.101562, 87067.187500, 93427.578125, 42545.218750}, + 10, + {-106747.085938, -138474.531250, -120796.882812, -113885.218750, -72582.734375, -166275.578125, -127054.914062, + -85032.500000, -43673.527344, -18238.730469, -125540.937500, -195438.906250, -91235.585938, -105800.562500, + -180613.671875, -56812.679688}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + int rounding; + float64_t expect[8]; +} test_mm512_addn_round_pd_data_model; +static test_mm512_addn_round_pd_data_model g_test_mm512_addn_round_pd_data = { + {895828870.5853160620, 13198299.6345069464, -46353094.4883757681, 1008132723.0642696619, 1132533339.7273423672, + 190439947.2382000089, -1511701151.4810729027, 446683041.9176963568}, + {72817530.7555908561, -86737498.6360623688, 463481696.5422678590, 1548820508.3372950554, -139317257.6059397161, + 2251411999.9036965370, 1218114754.9971106052, 27277800.0651864633}, + 10, + {-968646401.3409068584, 73539199.0015554279, -417128602.0538920760, -2556953231.4015645981, -993216082.1214026213, + -2441851947.1418967247, 293586396.4839622974, -473960841.9828827977}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; + __mmask16 sign; +} test_mm512_addsetc_epi32_data_model; +static test_mm512_addsetc_epi32_data_model g_test_mm512_addsetc_epi32_data = { + {1, -1, -1, -545454, -2147483648, 313213, -1695550175, -486101411, 1, 16516, -21313, -545454, -2147483648, + 313213, -1695550175, -486101411}, + {-1, -1, 0, 1321564, 2147483647, -13213211, 25646321, -1277938685, 1, 321321, -113132, 1321564, 2147483647, + -13213211, 25646321, -1277938685}, + {0, -2, -1, 776110, -1, -12899998, -1669903854, -1764040096, 2, 337837, -134445, 776110, -1, -12899998, + -1669903854, -1764040096}, + 0x8c8b}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; + __mmask16 sign; +} test_mm512_addsets_epi32_data_model; +static test_mm512_addsets_epi32_data_model g_test_mm512_addsets_epi32_data = { + {1, -1, -1, -545454, -2147483648, 313213, -1695550175, -486101411, 1, 16516, -21313, -545454, -2147483648, + 313213, -1695550175, -486101411}, + {-1, -1, 0, 1321564, 
2147483647, -13213211, 25646321, -1277938685, 1, 321321, -113132, 1321564, 2147483647, + -13213211, 25646321, -1277938685}, + {0, -2, -1, 776110, -1, -12899998, -1669903854, -1764040096, 2, 337837, -134445, 776110, -1, -12899998, + -1669903854, -1764040096}, + 0x8482}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; + __mmask16 sign; +} test_mm512_addsets_ps_data_model; +static test_mm512_addsets_ps_data_model g_test_mm512_addsets_ps_data = { + {0.666720, 0.168380, 0.935157, -0.852902, -0.513789, 0.557619, 0.700522, -0.650639, 0.280436, 0.672595, -0.310473, + 0.256968, -0.051129, -0.756766, -0.055633, 0.078373}, + {0.936496, 0.182931, 0.308435, 0.974094, 0.673502, -0.955101, 0.452082, 0.973139, 0.060114, -0.068916, -0.334961, + -0.554013, -0.424114, 0.131145, -0.363884, -0.932945}, + {1.603216, 0.351312, 1.243592, 0.121192, 0.159712, -0.397482, 1.152604, 0.322500, 0.340550, 0.603679, -0.645434, + -0.297046, -0.475242, -0.625621, -0.419517, -0.854572}, + 0x5400}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + int rounding; + float32_t expect[16]; + __mmask16 sign; +} test_mm512_addsets_round_ps_data_model; +static test_mm512_addsets_round_ps_data_model g_test_mm512_addsets_round_ps_data = { + {0.666720, 0.168380, 0.935157, -0.852902, -0.513789, 0.557619, 0.700522, -0.650639, 0.280436, 0.672595, -0.310473, + 0.256968, -0.051129, -0.756766, -0.055633, 0.078373}, + {0.936496, 0.182931, 0.308435, 0.974094, 0.673502, -0.955101, 0.452082, 0.973139, 0.060114, -0.068916, -0.334961, + -0.554013, -0.424114, 0.131145, -0.363884, -0.932945}, + 10, + {1.603216, 0.351312, 1.243592, 0.121192, 0.159712, -0.397482, 1.152604, 0.322500, 0.340550, 0.603679, -0.645434, + -0.297046, -0.475242, -0.625621, -0.419517, -0.854572}, + 0x5400}; + +typedef struct { + float32_t a[8]; + float32_t b[8]; + float32_t expect[8]; +} test_mm256_addsub_ps_data_model; +static test_mm256_addsub_ps_data_model g_test_mm256_addsub_ps_data = { + {89742.835938, 73413.921875, 62127.492188, 89540.796875, 91808.859375, 18845.945312, 35417.636719, 12672.005859}, + {51026.980469, 74192.773438, 31054.048828, 66186.554688, 8255.891602, 5566.725098, 77149.703125, 42545.218750}, + {38715.855469, 147606.687500, 31073.443359, 155727.343750, 83552.968750, 24412.669922, -41732.066406, + 55217.226562}}; + +typedef struct { + float64_t a[4]; + float64_t b[4]; + float64_t expect[4]; +} test_mm256_addsub_pd_data_model; +static test_mm256_addsub_pd_data_model g_test_mm256_addsub_pd_data = { + {13198299.6345069464, -46353094.4883757681, 1008132723.0642696619, 1132533339.7273423672}, + {463481696.5422678590, 1548820508.3372950554, -139317257.6059397161, 2251411999.9036965370}, + {-450283396.9077609181, 1502467413.8489193916, 1147449980.6702094078, 3383945339.6310386658}}; + +typedef struct { + float32_t a[8]; + float32_t b[8]; + float32_t m[8]; + float32_t expect[8]; +} test_mm256_blendv_ps_data_model; +static test_mm256_blendv_ps_data_model g_test_mm256_blendv_ps_data = { + {65.3, 0.000452, -547.22235, 989877.4, 4590, -100.23, 546.34, 12}, + {90.8, 13, 6698.625, 8744.879, 0, 1000, 23.45, -0.00547}, + {1.00, 0, -23.56, 100.2, -345, -399.45, -888.658, 10.22}, + {65.300003, 0.000452, 6698.625000, 989877.375000, 0.000000, 1000.000000, 23.450001, 12.000000}}; + +typedef struct { + float64_t a[4]; + float64_t b[4]; + float64_t m[4]; + float64_t expect[4]; +} test_mm256_blendv_pd_data_model; +static test_mm256_blendv_pd_data_model g_test_mm256_blendv_pd_data = {{65.3, 0.000452, -547.22235, 989877.4}, + {90.8, 13, 
6698.625, 8744.879}, + {1.00, 0, -23.56, 100.2}, + {65.300, 0.000452, 6698.6250, 989877.4}}; + +typedef struct { + float32_t a[8]; + float32_t b[8]; + int imm; + float32_t expect[8]; +} test_mm256_blend_ps_data_model; +static test_mm256_blend_ps_data_model g_test_mm256_blend_ps_data = { + {65.3, 0.000452, -547.22235, 989877.4, 4590, -100.23, 546.34, 12}, + {90.8, 13, 6698.625, 8744.879, 0, 1000, 23.45, -0.00547}, + 0, + {65.300003, 0.000452, -547.222351, 989877.375000, 4590.000000, -100.230003, 546.340027, 12.000000}}; + +typedef struct { + float64_t a[4]; + float64_t b[4]; + int imm; + float64_t expect[4]; +} test_mm256_blend_pd_data_model; +static test_mm256_blend_pd_data_model g_test_mm256_blend_pd_data = {{65.3, 0.000452, -547.22235, 989877.4}, + {90.8, 13, 6698.625, 8744.879}, + 1, + {90.800000, 0.000452, -547.222350, 989877.400000}}; + +typedef struct { + __mmask16 k; + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_mask_blend_ps_data_model; +static test_mm512_mask_blend_ps_data_model g_test_mm512_mask_blend_ps_data = { + 0, + {10, -9.78, 6.33, 0.45, 0, -13, 30.5, 1, 65.3, 0.000452, -547.22235, 989877.4, 4590, -100.23, 546.34, 12}, + {100, 20, -7.5, 9.5, 0.2, 3, -65.12, 88.1, 90.8, 13, 6698.625, 8744.879, 0, 1000, 23.45, -0.00547}, + {10.000000, -9.780000, 6.330000, 0.450000, 0.000000, -13.000000, 30.500000, 1.000000, 65.300003, 0.000452, + -547.222351, 989877.375000, 4590.000000, -100.230003, 546.340027, 12.000000}}; + +typedef struct { + __mmask8 k; + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_mask_blend_pd_data_model; +static test_mm512_mask_blend_pd_data_model g_test_mm512_mask_blend_pd_data = { + 32, + {10, -9.78, 6.33, 0.45, 0, -13, 30.5, 1}, + {100, 20, -7.5, 9.5, 0.2, 3, -65.12, 88.1}, + {10.000000, -9.780000, 6.330000, 0.450000, 0.000000, 3.000000, 30.500000, 1.000000}}; + +typedef struct { + int16_t a[16]; + int16_t b[16]; + int16_t expect[16]; +} test_mm256_sub_epi16_data_model; +static test_mm256_sub_epi16_data_model g_test_mm256_sub_epi16_data = { + {29243, 30783, 25393, 13436, -8564, -24992, -29429, 10542, -30544, 20419, 6673, -29899, 13504, -2735, 4096, 31088}, + {30660, -1312, 2510, 7648, 28728, 19162, 2999, -18632, -24918, -14799, 31045, -12049, 9362, -3054, -2280, 571}, + {-1417, 32095, 22883, 5788, 28244, 21382, -32428, 29174, -5626, -30318, -24372, -17850, 4142, 319, 6376, 30517}}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + int32_t expect[8]; +} test_mm256_sub_epi32_data_model; +static test_mm256_sub_epi32_data_model g_test_mm256_sub_epi32_data = { + {2147483647, -2147483648, 4568122, 789, 741852, 32, 0, 852}, + {-2, 2, 48792, 1236589, -7895412, 32, 15, 23}, + {-2147483647, 2147483646, 4519330, -1235800, 8637264, 0, -15, 829}}; + +typedef struct { + int64_t a[4]; + int64_t b[4]; + int64_t expect[4]; +} test_mm256_sub_epi64_data_model; +static test_mm256_sub_epi64_data_model g_test_mm256_sub_epi64_data = { + {9223372036854775805, -9223372036854775805, 4568122, 789}, + {-10, 10, 48792, 1236589}, + {-9223372036854775801, 9223372036854775801, 4519330, -1235800}}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + int8_t expect[32]; +} test_mm256_sub_epi8_data_model; +static test_mm256_sub_epi8_data_model g_test_mm256_sub_epi8_data = { + {-95, 87, 117, -34, 56, -19, 113, 106, 2, -103, -21, 64, 49, 39, 119, 44, + 98, -59, -1, 78, -121, -122, 26, 120, -104, 102, -71, 22, -64, -53, -105, -49}, + {-120, -83, 124, 119, -29, 78, -34, 127, -89, 74, -88, -109, 25, -32, 89, -104, + 3, 87, 36, -73, -68, 121, 
112, -100, -111, 3, 39, 107, -67, -73, 120, 121}, + {25, -86, -7, 103, 85, -97, -109, -21, 91, 79, 67, -83, 24, 71, 30, -108, + 95, 110, -37, -105, -53, 13, -86, -36, 7, 99, -110, -85, 3, 20, 31, 86}}; + +typedef struct { + float64_t a[4]; + float64_t b[4]; + float64_t expect[4]; +} test_mm256_sub_pd_data_model; +static test_mm256_sub_pd_data_model g_test_mm256_sub_pd_data = {{1.000000, -0.789450, 67.157000, -145.200000}, + {0.123895, 1254.687500, 120145.000000, 21.520000}, + {0.876105, -1255.476950, -120077.843000, -166.720000}}; + +typedef struct { + float32_t a[8]; + float32_t b[8]; + float32_t expect[8]; +} test_mm256_sub_ps_data_model; +static test_mm256_sub_ps_data_model g_test_mm256_sub_ps_data = { + {1.500000, 127.199997, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480}, + {4.000000, 20.000000, 2.000000, 3.100000, -10.000000, 5.450000, 47.125469, 34.256100}, + {-2.500000, 107.199997, 6.000000, -12.662001, -90.000000, -134.295700, 1045433.875000, -27.130619}}; + +typedef struct { + int16_t a[32]; + int16_t b[32]; + int16_t expect[32]; +} test_mm512_sub_epi16_data_model; +static test_mm512_sub_epi16_data_model g_test_mm512_sub_epi16_data = { + {-18859, 20464, -27471, -26305, -24156, -3887, 15196, 23902, -20918, 19080, 21522, + -32031, 9623, 26035, -12362, -29218, 1563, 21128, 3676, 31916, 7853, -30705, + -18084, 9343, -275, -16867, -16319, -25163, 6253, -12397, 27971, -4561}, + {7232, -13187, 12429, -7712, 13030, -10226, 19422, -13707, 6528, 22135, 20793, + 19126, -10386, -5701, 27098, 1547, 24015, 6860, 27591, 12289, -4738, -9718, + 25965, -6234, -4345, -11521, -16129, 26072, -31896, 26659, -8847, -3234}, + {-26091, -31885, 25636, -18593, 28350, 6339, -4226, -27927, -27446, -3055, 729, + 14379, 20009, 31736, 26076, -30765, -22452, 14268, -23915, 19627, 12591, -20987, + 21487, 15577, 4070, -5346, -190, 14301, -27387, 26480, -28718, -1327}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_sub_epi32_data_model; +static test_mm512_sub_epi32_data_model g_test_mm512_sub_epi32_data = { + {-2147483647, 2147483645, -27471, -26305, -243156, -3887, 15196, 23902, -20918, 190380, 21522, -32031, 96233, + 260335, -12362, -292318}, + {8, -10, 12429, -7712, 13030, -10226, 193422, -13707, 6528, 22135, 20793, 191326, -10386, -5701, 27098, 15347}, + {2147483641, -2147483641, -39900, -18593, -256186, 6339, -178226, 37609, -27446, 168245, 729, -223357, 106619, + 266036, -39460, -307665}}; + +typedef struct { + int64_t a[8]; + int64_t b[8]; + int64_t expect[8]; +} test_mm512_sub_epi64_data_model; +static test_mm512_sub_epi64_data_model g_test_mm512_sub_epi64_data = { + {-9223372036854775805, 9223372036854775804, -274877871, -2654305, -2431275156, -3845487, 1587196, 2390272}, + {20, -10, 1242954575787, -7744412, 137842030, -104557226, 1587196, -1378707}, + {9223372036854775791, -9223372036854775802, -1243229453658, 5090107, -2569117186, 100711739, 0, 3768979}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + int8_t expect[64]; +} test_mm512_sub_epi8_data_model; +static test_mm512_sub_epi8_data_model g_test_mm512_sub_epi8_data = { + {-43, 112, 49, -65, 36, 81, -36, -34, -54, 8, -110, 97, 23, 51, 54, 94, -101, 8, -36, 44, 45, -113, + -36, -1, 109, -99, -63, 53, -19, 19, -61, -81, -64, -86, -1, 30, -44, -74, -102, -42, 61, -122, -13, 86, + -92, -124, 90, 69, 106, 56, 69, 22, -125, 74, 2, -92, 21, -67, 101, 104, -4, -16, -97, -15}, + {-64, -3, 13, 96, 102, -114, 94, -11, 0, -9, -71, 54, -18, 59, 90, -117, + 79, 76, 71, -127, -2, -118, -19, 
38, -121, 127, 127, 88, -24, -93, -15, -34, + 55, 29, 86, -84, 45, 48, 53, -121, -12, 126, 59, 96, -23, -25, -77, -101, + 111, -23, -42, -102, 76, -99, -95, -65, -86, -120, -109, -119, 108, -42, -75, -118}, + {21, 115, 36, 95, -66, -61, 126, -23, -54, 17, -39, 43, 41, -8, -36, -45, 76, -68, -107, -85, 47, 5, + -17, -39, -26, 30, 66, -35, 5, 112, -46, -47, -119, -115, -87, 114, -89, -122, 101, 79, 73, 8, -72, -10, + -69, -99, -89, -86, -5, 79, 111, 124, 55, -83, 97, -27, 107, 53, -46, -33, -112, 26, -22, 103}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_sub_pd_data_model; +static test_mm512_sub_pd_data_model g_test_mm512_sub_pd_data = { + {45.014000, 49454.000000, 120.000000, -1451548.000000, -145.200000, 112.000000, 11.400000, 457788.000000}, + {0.000125, 7.454500, 20.000000, 0.020000, 21.520000, 5145.000000, 65421.000000, 11.250000}, + {45.013875, 49446.545500, 100.000000, -1451548.020000, -166.720000, -5033.000000, -65409.600000, 457776.750000}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_sub_ps_data_model; +static test_mm512_sub_ps_data_model g_test_mm512_sub_ps_data = { + {1.550000, 127.245003, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480, 16.145000, + 20.000999, 58.698002, 57.146999, 67.156998, 30.124500, 40.658001, 14.650000}, + {4.045000, 20.540001, 2.545000, 3.100000, -10.054500, 5.450000, 47.125469, 34.256100, 12.012000, 20.000999, + 58.698002, 4487.120117, 120145.000000, 11.047000, 12.200000, 0.001400}, + {-2.495000, 106.705002, 5.455000, -12.662001, -89.945503, -134.295700, 1045433.875000, -27.130619, 4.133000, + 0.000000, 0.000000, -4429.973145, -120077.843750, 19.077499, 28.458000, 14.648600}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_sub_round_pd_data_model; +static test_mm512_sub_round_pd_data_model g_test_mm512_sub_round_pd_data = { + {45.014000, 49454.000000, 120.000000, -1451548.000000, -145.200000, 112.000000, 11.400000, 457788.000000}, + {0.000125, 7.454500, 20.000000, 0.020000, 21.520000, 5145.000000, 65421.000000, 11.250000}, + {45.013875, 49446.545500, 100.000000, -1451548.020000, -166.720000, -5033.000000, -65409.600000, 457776.750000}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_sub_round_ps_data_model; +static test_mm512_sub_round_ps_data_model g_test_mm512_sub_round_ps_data = { + {1.550000, 127.245003, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480, 16.145000, + 20.000000, 58.698002, 57.146999, 67.156998, 30.124500, 40.658001, 14.650000}, + {4.045000, 20.540001, 2.545000, 3.100000, -10.054500, 5.450000, -47.125469, 34.256100, 12.012000, 20.000999, + 58.698002, 4487.120117, 120145.000000, 11.047000, 12.200000, 0.001400}, + {-2.495000, 106.705002, 5.455000, -12.662001, -89.945503, -134.295700, 1045528.125000, -27.130619, 4.133000, + -0.000999, 0.000000, -4429.973145, -120077.843750, 19.077499, 28.458000, 14.648600}}; + +typedef struct { + int16_t a[16]; + int16_t b[16]; + int16_t expect[16]; +} test_mm256_subs_epi16_data_model; +static test_mm256_subs_epi16_data_model g_test_mm256_subs_epi16_data = { + {-18859, 20464, -27471, -26305, -24156, -3887, 15196, 23902, -20918, 19080, 21522, -32031, 9623, 26035, -12362, + -29218}, + {7232, -13187, 12429, -7712, 13030, -10226, 19422, -13707, 6528, 22135, 20793, 19126, -10386, -5701, 27098, 1547}, + {-26091, 32767, -32768, -18593, -32768, 6339, -4226, 32767, -27446, 
-3055, 729, -32768, 20009, 31736, -32768, + -30765}}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + int8_t expect[32]; +} test_mm256_subs_epi8_data_model; +static test_mm256_subs_epi8_data_model g_test_mm256_subs_epi8_data = { + {-43, 112, 49, -65, 36, 81, -36, -34, -54, 8, -110, 97, 23, 51, 54, 94, + -101, 8, -36, 44, 45, -113, -36, -1, 109, -99, -63, 53, -19, 19, -61, -81}, + {-64, -3, 13, 96, 102, -114, 94, -11, 0, -9, -71, 54, -18, 59, 90, -117, + 79, 76, 71, -127, -2, -118, -19, 38, -121, 127, 127, 88, -24, -93, -15, -34}, + {21, 115, 36, -128, -66, 127, -128, -23, -54, 17, -39, 43, 41, -8, -36, 127, + -128, -68, -107, 127, 47, 5, -17, -39, 127, -128, -128, -35, 5, 112, -46, -47}}; + +typedef struct { + uint16_t a[16]; + uint16_t b[16]; + uint16_t expect[16]; +} test_mm256_subs_epu16_data_model; +static test_mm256_subs_epu16_data_model g_test_mm256_subs_epu16_data = { + {48115, 53651, 11817, 10729, 43624, 5083, 47771, 73, 33342, 33488, 60520, 62003, 28030, 18563, 21829, 15359}, + {56328, 10349, 60556, 52706, 57882, 45737, 9403, 27268, 35617, 46221, 52222, 55106, 35299, 51054, 18674, 4409}, + {0, 43302, 0, 0, 0, 0, 38368, 0, 0, 0, 8298, 6897, 0, 0, 3155, 10950}}; + +typedef struct { + uint8_t a[32]; + uint8_t b[32]; + uint8_t expect[32]; +} test_mm256_subs_epu8_data_model; +static test_mm256_subs_epu8_data_model g_test_mm256_subs_epu8_data = { + {243, 147, 41, 233, 104, 219, 155, 73, 62, 208, 104, 51, 126, 131, 69, 255, + 250, 166, 31, 5, 169, 48, 156, 200, 155, 104, 150, 69, 126, 114, 186, 253}, + {8, 109, 140, 226, 26, 169, 187, 132, 33, 141, 254, 66, 227, 94, 242, 57, + 146, 35, 143, 135, 224, 68, 121, 218, 152, 3, 155, 20, 200, 196, 114, 180}, + {235, 38, 0, 7, 78, 50, 0, 0, 29, 67, 0, 0, 0, 37, 0, 198, + 104, 131, 0, 0, 0, 0, 35, 0, 3, 101, 0, 49, 0, 0, 72, 73}}; + +typedef struct { + int16_t a[32]; + int16_t b[32]; + int16_t expect[32]; +} test_mm512_subs_epi16_data_model; +static test_mm512_subs_epi16_data_model g_test_mm512_subs_epi16_data = { + {-18859, 20464, -27471, -26305, -24156, -3887, 15196, 23902, -20918, 19080, 21522, + -32031, 9623, 26035, -12362, -29218, 1563, 21128, 3676, 31916, 7853, -30705, + -18084, 9343, -275, -16867, -16319, -25163, 6253, -12397, 27971, -4561}, + {7232, -13187, 12429, -7712, 13030, -10226, 19422, -13707, 6528, 22135, 20793, + 19126, -10386, -5701, 27098, 1547, 24015, 6860, 27591, 12289, -4738, -9718, + 25965, -6234, -4345, -11521, -16129, 26072, -31896, 26659, -8847, -3234}, + {-26091, 32767, -32768, -18593, -32768, 6339, -4226, 32767, -27446, -3055, 729, + -32768, 20009, 31736, -32768, -30765, -22452, 14268, -23915, 19627, 12591, -20987, + -32768, 15577, 4070, -5346, -190, -32768, 32767, -32768, 32767, -1327}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + int8_t expect[64]; +} test_mm512_subs_epi8_data_model; +static test_mm512_subs_epi8_data_model g_test_mm512_subs_epi8_data = { + {-43, 112, 49, -65, 36, 81, -36, -34, -54, 8, -110, 97, 23, 51, 54, 94, -101, 8, -36, 44, 45, -113, + -36, -1, 109, -99, -63, 53, -19, 19, -61, -81, -64, -86, -1, 30, -44, -74, -102, -42, 61, -122, -13, 86, + -92, -124, 90, 69, 106, 56, 69, 22, -125, 74, 2, -92, 21, -67, 101, 104, -4, -16, -97, -15}, + {-64, -3, 13, 96, 102, -114, 94, -11, 0, -9, -71, 54, -18, 59, 90, -117, + 79, 76, 71, -127, -2, -118, -19, 38, -121, 127, 127, 88, -24, -93, -15, -34, + 55, 29, 86, -84, 45, 48, 53, -121, -12, 126, 59, 96, -23, -25, -77, -101, + 111, -23, -42, -102, 76, -99, -95, -65, -86, -120, -109, -119, 108, -42, -75, -118}, + {21, 115, 36, -128, -66, 127, -128, 
-23, -54, 17, -39, 43, 41, -8, -36, 127, + -128, -68, -107, 127, 47, 5, -17, -39, 127, -128, -128, -35, 5, 112, -46, -47, + -119, -115, -87, 114, -89, -122, -128, 79, 73, -128, -72, -10, -69, -99, 127, 127, + -5, 79, 111, 124, -128, 127, 97, -27, 107, 53, 127, 127, -112, 26, -22, 103}}; + +typedef struct { + uint16_t a[32]; + uint16_t b[32]; + uint16_t expect[32]; +} test_mm512_subs_epu16_data_model; +static test_mm512_subs_epu16_data_model g_test_mm512_subs_epu16_data = { + {48115, 53651, 11817, 10729, 43624, 5083, 47771, 73, 33342, 33488, 60520, 62003, 28030, 18563, 21829, 15359, + 9466, 14758, 64031, 8453, 17833, 22320, 22940, 27336, 58011, 36968, 47510, 42565, 3198, 51058, 14266, 29437}, + {56328, 10349, 60556, 52706, 57882, 45737, 9403, 27268, 35617, 46221, 52222, 55106, 35299, 51054, 18674, 4409, + 3474, 21283, 25487, 42119, 13536, 68, 22393, 56282, 60824, 52995, 33435, 10004, 61128, 25028, 882, 23732}, + {0, 43302, 0, 0, 0, 0, 38368, 0, 0, 0, 8298, 6897, 0, 0, 3155, 10950, + 5992, 0, 38544, 0, 4297, 22252, 547, 0, 0, 0, 14075, 32561, 0, 26030, 13384, 5705}}; + +typedef struct { + uint8_t a[64]; + uint8_t b[64]; + uint8_t expect[64]; +} test_mm512_subs_epu8_data_model; +static test_mm512_subs_epu8_data_model g_test_mm512_subs_epu8_data = { + {243, 147, 41, 233, 104, 219, 155, 73, 62, 208, 104, 51, 126, 131, 69, 255, 250, 166, 31, 5, 169, 48, + 156, 200, 155, 104, 150, 69, 126, 114, 186, 253, 4, 215, 50, 170, 189, 32, 84, 52, 129, 242, 55, 44, + 0, 122, 16, 111, 26, 56, 36, 39, 190, 59, 48, 2, 11, 200, 44, 248, 147, 232, 233, 184}, + {8, 109, 140, 226, 26, 169, 187, 132, 33, 141, 254, 66, 227, 94, 242, 57, 146, 35, 143, 135, 224, 68, + 121, 218, 152, 3, 155, 20, 200, 196, 114, 180, 163, 35, 221, 219, 219, 89, 232, 239, 156, 23, 56, 182, + 158, 187, 119, 21, 71, 76, 227, 225, 71, 18, 111, 177, 244, 66, 244, 45, 114, 163, 87, 3}, + {235, 38, 0, 7, 78, 50, 0, 0, 29, 67, 0, 0, 0, 37, 0, 198, 104, 131, 0, 0, 0, 0, + 35, 0, 3, 101, 0, 49, 0, 0, 72, 73, 0, 180, 0, 0, 0, 0, 0, 0, 0, 219, 0, 0, + 0, 0, 0, 90, 0, 0, 0, 0, 119, 41, 0, 0, 0, 134, 0, 203, 33, 69, 146, 181}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_subr_epi32_data_model; +static test_mm512_subr_epi32_data_model g_test_mm512_subr_epi32_data = { + {8, -10, 12429, -7712, 13030, -10226, 193422, -13707, 6528, 22135, 20793, 191326, -10386, -5701, 27098, 15347}, + {-2147483647, 2147483645, -27471, -26305, -243156, -3887, 15196, 23902, -20918, 190380, 21522, -32031, 96233, + 260335, -12362, -292318}, + {2147483641, -2147483641, -39900, -18593, -256186, 6339, -178226, 37609, -27446, 168245, 729, -223357, 106619, + 266036, -39460, -307665}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_subr_pd_data_model; +static test_mm512_subr_pd_data_model g_test_mm512_subr_pd_data = { + {0.000125, 7.454500, 20.000000, 0.020000, 21.520000, 5145.000000, 65421.000000, 11.250000}, + {45.014000, 49454.000000, 120.000000, -1451548.000000, -145.200000, 112.000000, 11.400000, 457788.000000}, + {45.013875, 49446.545500, 100.000000, -1451548.020000, -166.720000, -5033.000000, -65409.600000, 457776.750000}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_subr_ps_data_model; +static test_mm512_subr_ps_data_model g_test_mm512_subr_ps_data = { + {4.045000, 20.540001, 2.545000, 3.100000, -10.054500, 5.450000, 47.125469, 34.256100, 12.012000, 20.000999, + 58.698002, 4487.120117, 120145.000000, 11.047000, 12.200000, 0.001400}, 
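+     /*
+      * Added note: the *_subr_* vectors are the reversed subtraction, i.e.
+      * expect[i] = b[i] - a[i] (operand order swapped relative to the
+      * *_sub_* models above; e.g. 1.550000 - 4.045000 = -2.495000), which
+      * matches the documented subr semantics.
+      */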
+ {1.550000, 127.245003, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480, 16.145000, + 20.000999, 58.698002, 57.146999, 67.156998, 30.124500, 40.658001, 14.650000}, + {-2.495000, 106.705002, 5.455000, -12.662001, -89.945503, -134.295700, 1045433.875000, -27.130619, 4.133000, + 0.000000, 0.000000, -4429.973145, -120077.843750, 19.077499, 28.458000, 14.648600}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_subr_round_pd_data_model; +static test_mm512_subr_round_pd_data_model g_test_mm512_subr_round_pd_data = { + {0.000125, 7.454500, 20.000000, 0.020000, 21.520000, 5145.000000, 65421.000000, 11.250000}, + {45.014000, 49454.000000, 120.000000, -1451548.000000, -145.200000, 112.000000, 11.400000, 457788.000000}, + {45.013875, 49446.545500, 100.000000, -1451548.020000, -166.720000, -5033.000000, -65409.600000, 457776.750000}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_subr_round_ps_data_model; +static test_mm512_subr_round_ps_data_model g_test_mm512_subr_round_ps_data = { + {4.045000, 20.540001, 2.545000, 3.100000, -10.054500, 5.450000, -47.125469, 34.256100, 12.012000, 20.000999, + 58.698002, 4487.120117, 120145.000000, 11.047000, 12.200000, 0.001400}, + {1.550000, 127.245003, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480, 16.145000, + 20.000000, 58.698002, 57.146999, 67.156998, 30.124500, 40.658001, 14.650000}, + {-2.495000, 106.705002, 5.455000, -12.662001, -89.945503, -134.295700, 1045528.125000, -27.130619, 4.133000, + -0.000999, 0.000000, -4429.973145, -120077.843750, 19.077499, 28.458000, 14.648600}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; + __mmask16 borrow; +} test_mm512_subsetb_epi32_data_model; +static test_mm512_subsetb_epi32_data_model g_test_mm512_subsetb_epi32_data = { + {-1, 2147483645, -27471, -26305, -243156, -3887, 15196, 23902, -20918, 190380, 21522, -32031, 96233, + 260335, -12362, -292318}, + {0, -10, 12429, -7712, 13030, -10226, 193422, -13707, 6528, 22135, 20793, 191326, -10386, -5701, 27098, 15347}, + {-1, -2147483641, -39900, -18593, -256186, 6339, -178226, 37609, -27446, 168245, 729, -223357, 106619, + 266036, -39460, -307665}, + 0x30ca}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; + __mmask16 borrow; +} test_mm512_subrsetb_epi32_data_model; +static test_mm512_subrsetb_epi32_data_model g_test_mm512_subrsetb_epi32_data = { + {8, -10, 12429, -7712, 13030, -10226, 193422, -13707, 6528, 22135, 20793, 191326, -10386, -5701, 27098, 15347}, + {-2147483647, 2147483645, -27471, -26305, -243156, -3887, 15196, 23902, -20918, 190380, 21522, -32031, 96233, + 260335, -12362, -292318}, + {2147483641, -2147483641, -39900, -18593, -256186, 6339, -178226, 37609, -27446, 168245, 729, -223357, 106619, + 266036, -39460, -307665}, + 0x30ca}; + +typedef struct { + int64_t a[8]; + int64_t b[8]; + int64_t expect[8]; +} test_mm512_permutexvar_epi64_data_model; +static test_mm512_permutexvar_epi64_data_model g_test_mm512_permutexvar_epi64_data = { + {-9223372036854775805, 9223372036854775804, -274877871, -2654305, -2431275156, -3845487, 1587196, 2390272}, + {20, -10, 1242954575787, -7744412, 137842030, -104557226, 1587196, -1378707}, + {-7744412, 137842030, -10, -1378707, 137842030, -10, 137842030, 20}}; + +typedef struct { + int32_t a[16]; + const int imm8; + int32_t expect[4]; +} test_mm512_extracti32x4_epi32_data_model; +static test_mm512_extracti32x4_epi32_data_model 
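/* added note: imm8 == 2 selects 32-bit elements 8..11 of a, hence expect = {16, 4, 5, 8} */ 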
g_test_mm512_extracti32x4_epi32_data = { + {1, 1, 1, 1, 1, 2, 4, 8, 16, 4, 5, 8, 10, 5, 4, 8}, 2, {16, 4, 5, 8}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + __mmask64 expect; +} test_mm512_test_epi8_mask_data_model; +static test_mm512_test_epi8_mask_data_model g_test_mm512_test_epi8_mask_data = { + {88, 80, -45, 59, -96, 21, -105, 88, -38, -70, -5, -69, -117, -48, -119, 48, + -122, -11, -37, 39, -48, 108, -53, 57, 22, -13, -90, 50, -103, 112, -2, -125, + -95, 93, 40, 119, 65, -31, 41, 49, -115, -48, 83, -26, -124, -114, 119, -122, + 9, 21, -24, -85, -15, -75, 38, -39, -118, 103, 44, 74, 36, -77, 106, 52}, + {10, 20, 90, -50, -96, -15, -114, -47, 81, 78, -77, 32, -22, 62, -4, 97, 0, -39, -80, -5, -68, -24, + 69, 37, 115, -111, 47, -78, -126, -94, 33, 4, -8, -4, 4, 121, 100, -116, -102, -65, -92, -76, -126, 108, + -42, 2, -111, -103, 99, -78, 13, -87, 12, -102, 102, 51, 41, 94, -51, 48, 89, 27, -71, -12}, + 16712858197844492287ULL}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + __mmask16 expect; +} test_mm512_test_epi32_mask_data_model; +static test_mm512_test_epi32_mask_data_model g_test_mm512_test_epi32_mask_data = { + {15196, 23902, -20918, 545, 45, 0, -1, 489, 444, 9998, 21522, -32031, 96233, 260335, -12362, -292318}, + {193422, -13707, 6528, 89, 893214, 47, 558745, 555, 555, 52864, 20793, 191326, -10386, -5701, 27098, 15347}, + 65503}; + +typedef struct { + int64_t a[8]; + int64_t b[8]; + __mmask8 expect; +} test_mm512_test_epi64_mask_data_model; +static test_mm512_test_epi64_mask_data_model g_test_mm512_test_epi64_mask_data = { + {444, 9998, 21522, -32031, 96233, 260335, -12362, -292318}, + {5888755, 52864, 20793, 191326, -10386, -5701, 27098, 15347}, + 255}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + int32_t expect[8]; +} test_mm256_mul_epi32_data_model; +static test_mm256_mul_epi32_data_model g_test_mm256_mul_epi32_data = { + {-2147483647, 2147483645, -27471, -26305, -243156, -3887, 15196, 23902}, + {8, -10, 12429, -7712, 13030, -10226, 193422, -13707}, + {8, -4, -341437059, -1, 1126644616, -1, -1355726584, 0}}; + +typedef struct { + uint32_t a[8]; + uint32_t b[8]; + uint32_t expect[8]; +} test_mm256_mul_epu32_data_model; +static test_mm256_mul_epu32_data_model g_test_mm256_mul_epu32_data = { + {4294967290, 2147483645, 27471, 26305, 243156, 3887, 15196, 23902}, + {8, 8, 12429, 7712, 0, 1, 193422, 13707}, + {4294967248, 7, 341437059, 0, 0, 0, 2939240712, 0}}; + +typedef struct { + float64_t a[4]; + float64_t b[4]; + float64_t expect[4]; +} test_mm256_mul_pd_data_model; +static test_mm256_mul_pd_data_model g_test_mm256_mul_pd_data = { + {1.550000, 127.245000, 8.000000, -9.562000}, + {4.045000, 20.540000, 2.545000, 3.100000}, + {6.269750000, 2613.612300000, 20.360000000, -29.642200000}}; + +typedef struct { + float32_t a[8]; + float32_t b[8]; + float32_t expect[8]; +} test_mm256_mul_ps_data_model; +static test_mm256_mul_ps_data_model g_test_mm256_mul_ps_data = { + {1.550000, 127.245003, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480}, + {4.045000, 20.540001, 2.545000, 3.100000, -10.054500, 5.450000, -47.125469, 34.256100}, + {6.269750, 2613.612549, 20.360001, -29.642200, 1005.449951, -702.209045, -49268784.000000, 244.091156}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_mul_epi32_data_model; +static test_mm512_mul_epi32_data_model g_test_mm512_mul_epi32_data = { + {-2147483647, 2147483645, -27471, -26305, -243156, -3887, 15196, 23902, -20918, 190380, 21522, -32031, 96233, + 260335, 
-12362, -292318}, + {8, -10, 12429, -7712, 13030, -10226, 193422, -13707, 6528, 22135, 20793, 191326, -10386, -5701, 27098, 15347}, + {8, -4, -341437059, -1, 1126644616, -1, -1355726584, 0, -136552704, -1, 447506946, 0, -999475938, -1, -334985476, + -1}}; + +typedef struct { + uint32_t a[16]; + uint32_t b[16]; + uint32_t expect[16]; +} test_mm512_mul_epu32_data_model; +static test_mm512_mul_epu32_data_model g_test_mm512_mul_epu32_data = { + {4294967290, 2147483645, 27471, 26305, 243156, 3887, 15196, 23902, 20918, 190380, 21522, 32031, 96233, 260335, + 12362, 292318}, + {8, 8, 12429, 7712, 0, 1, 193422, 13707, 6528, 22135, 20793, 191326, 10386, 5701, 27098, 15347}, + {4294967248, 7, 341437059, 0, 0, 0, 2939240712, 0, 136552704, 0, 447506946, 0, 999475938, 0, 334985476, 0}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_mul_pd_data_model; +static test_mm512_mul_pd_data_model g_test_mm512_mul_pd_data = { + {1.550000, 127.245000, 8.000000, -9.562000, -100.000000, -128.845700, 1045481.000000, 7.125480}, + {4.045000, 20.540000, 2.545000, 3.100000, -10.054500, 5.450000, -47.125470, 34.256100}, + {6.269750000, 2613.612300000, 20.360000000, -29.642200000, 1005.450000000, -702.209065000, -49268783.501070000, + 244.091155428}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_mul_ps_data_model; +static test_mm512_mul_ps_data_model g_test_mm512_mul_ps_data = { + {1.550000, 127.245003, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480, 16.145000, + 20.000000, 58.698002, 57.146999, 67.156998, 30.124500, 40.658001, 0.000000}, + {4.045000, 20.540001, 2.545000, 3.100000, -10.054500, 5.450000, -47.125469, 34.256100, 12.012000, 20.000999, + 58.698002, 4487.120117, 120145.000000, 11.047000, 12.200000, 0.004580}, + {6.269750, 2613.612549, 20.360001, -29.642200, 1005.449951, -702.209045, -49268784.000000, 244.091156, 193.933746, + 400.019989, 3445.455322, 256425.453125, 8068577.500000, 332.785339, 496.027618, 0.000000}}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + float64_t expect[8]; +} test_mm512_mul_round_pd_data_model; +static test_mm512_mul_round_pd_data_model g_test_mm512_mul_round_pd_data = { + {-9.562000, -100.000000, -128.845700, 1045481.000000, 7.125480, 16.145000, 20.000000, 58.698000}, + {3.100000, -10.054500, 5.450000, -47.125470, 34.256100, 12.012000, 20.001000, 58.698000}, + {-29.642200000, 1005.450000000, -702.209065000, -49268783.501070000, 244.091155428, 193.933740000, 400.020000000, + 3445.455204000}}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + float32_t expect[16]; +} test_mm512_mul_round_ps_data_model; +static test_mm512_mul_round_ps_data_model g_test_mm512_mul_round_ps_data = { + {-9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480, 16.145000, 20.000000, 58.698002, 57.146999, + 67.156998, 30.124500, 40.658001, 0.000000, 1.110000, 528.890015, 8.000000}, + {3.100000, -10.054500, 5.450000, -47.125469, 34.256100, 12.012000, 20.000999, 58.698002, 4487.120117, 120145.000000, + 11.047000, 12.200000, 0.004580, 4.400000, 11.890000, 8925.500000}, + {-29.642200, 1005.449951, -702.209045, -49268784.000000, 244.091156, 193.933746, 400.019989, 3445.455322, + 256425.453125, 8068577.500000, 332.785339, 496.027618, 0.000000, 4.884000, 6288.502441, 71404.000000}}; + +typedef struct { + int16_t a[16]; + int16_t b[16]; + int16_t expect[16]; +} test_mm256_mulhi_epi16_data_model; +static test_mm256_mulhi_epi16_data_model g_test_mm256_mulhi_epi16_data = { + 
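+    /*
+     * Added note: for the *_mulhi_* models, expect[i] is the high half of
+     * the widened product; for epi16 that is
+     * (int16_t)(((int32_t)a[i] * b[i]) >> 16), e.g. -18859 * 7232 >> 16
+     * gives -2082 below. The epu16/epi32/epu32 models that follow are the
+     * unsigned and 32-bit analogues.
+     */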
{-18859, 20464, -27471, -26305, -24156, -3887, 15196, 23902, -20918, 19080, 21522, -32031, 9623, 26035, -12362, + -29218}, + {7232, -13187, 12429, -7712, 13030, -10226, 19422, -13707, 6528, 22135, 20793, 19126, -10386, -5701, 27098, 1547}, + {-2082, -4118, -5210, 3095, -4803, 606, 4503, -5000, -2084, 6444, 6828, -9348, -1526, -2265, -5112, -690}}; + +typedef struct { + uint16_t a[16]; + uint16_t b[16]; + uint16_t expect[16]; +} test_mm256_mulhi_epu16_data_model; +static test_mm256_mulhi_epu16_data_model g_test_mm256_mulhi_epu16_data = { + {48115, 53651, 11817, 10729, 43624, 5083, 47771, 73, 33342, 33488, 60520, 62003, 28030, 18563, 21829, 15359}, + {56328, 10349, 60556, 52706, 57882, 45737, 9403, 27268, 35617, 46221, 52222, 55106, 35299, 51054, 18674, 4409}, + {41354, 8472, 10919, 8628, 38529, 3547, 6854, 30, 18120, 23618, 48225, 52135, 15097, 14460, 6220, 1033}}; + +typedef struct { + int16_t a[32]; + int16_t b[32]; + int16_t expect[32]; +} test_mm512_mulhi_epi16_data_model; +static test_mm512_mulhi_epi16_data_model g_test_mm512_mulhi_epi16_data = { + {-18859, 20464, -27471, -26305, -24156, -3887, 15196, 23902, -20918, 19080, 21522, + -32031, 9623, 26035, -12362, -29218, 1563, 21128, 3676, 31916, 7853, -30705, + -18084, 9343, -275, -16867, -16319, -25163, 6253, -12397, 27971, -4561}, + {7232, -13187, 12429, -7712, 13030, -10226, 19422, -13707, 6528, 22135, 20793, + 19126, -10386, -5701, 27098, 1547, 24015, 6860, 27591, 12289, -4738, -9718, + 25965, -6234, -4345, -11521, -16129, 26072, -31896, 26659, -8847, -3234}, + {-2082, -4118, -5210, 3095, -4803, 606, 4503, -5000, -2084, 6444, 6828, -9348, -1526, -2265, -5112, -690, + 572, 2211, 1547, 5984, -568, 4553, -7165, -889, 18, 2965, 4016, -10011, -3044, -5043, -3776, 225}}; + +typedef struct { + uint16_t a[32]; + uint16_t b[32]; + uint16_t expect[32]; +} test_mm512_mulhi_epu16_data_model; +static test_mm512_mulhi_epu16_data_model g_test_mm512_mulhi_epu16_data = { + {48115, 53651, 11817, 10729, 43624, 5083, 47771, 73, 33342, 33488, 60520, 62003, 28030, 18563, 21829, 15359, + 9466, 14758, 64031, 8453, 17833, 22320, 22940, 27336, 58011, 36968, 47510, 42565, 3198, 51058, 14266, 29437}, + {56328, 10349, 60556, 52706, 57882, 45737, 9403, 27268, 35617, 46221, 52222, 55106, 35299, 51054, 18674, 4409, + 3474, 21283, 25487, 42119, 13536, 68, 22393, 56282, 60824, 52995, 33435, 10004, 61128, 25028, 882, 23732}, + {41354, 8472, 10919, 8628, 38529, 3547, 6854, 30, 18120, 23618, 48225, 52135, 15097, 14460, 6220, 1033, + 501, 4792, 24901, 5432, 3683, 23, 7838, 23476, 53840, 29893, 24238, 6497, 2982, 19498, 191, 10659}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_mulhi_epi32_data_model; +static test_mm512_mulhi_epi32_data_model g_test_mm512_mulhi_epi32_data = { + {-2044617900, -567095272, -1177534049, 1579247274, -1775512468, -1288079916, 1157964942, -1205455363, -1227830238, + 2038814254, -740634116, -1381961705, -961458146, -2141144538, 1174921389, 288064003}, + {1663542419, 551916044, 1883567592, -910700319, 760327568, -343608356, 1203858730, -1715618161, 1316989780, + -2088110026, 403733409, -1171448665, -44626873, 2031576200, -977291887, -1335482720}, + {-791928873, -72873426, -516410213, -334861921, -314314635, 103049683, 324572018, 481517313, -376496435, -991222562, + -69620725, 376928875, 9990034, -1012789618, -267345725, -89570996}}; + +typedef struct { + uint32_t a[16]; + uint32_t b[16]; + uint32_t expect[16]; +} test_mm512_mulhi_epu32_data_model; +static test_mm512_mulhi_epu32_data_model 
g_test_mm512_mulhi_epu32_data = { + {2962023422, 1580466109, 1592432053, 2030319111, 1978641068, 3761711726, 678013506, 2334737703, 1766952438, + 1973346953, 1732746277, 2470577604, 1481071869, 1847054851, 2990013849, 2379125171}, + {2724133815, 2540255767, 2887352417, 3246089225, 3137576777, 174631087, 2587136935, 1609460072, 3240741195, + 2232628701, 2881515244, 3109730594, 1499924219, 1620516098, 858852952, 2031578088}, + {1878698394, 934765708, 1070534935, 1534492939, 1445444828, 152949199, 408411441, 874900052, 1333243110, 1025793851, + 1162508225, 1788798430, 517232242, 696904519, 597904953, 1125358642}}; + +typedef struct { + int16_t a[16]; + int16_t b[16]; + int16_t expect[16]; +} test_mm256_mullo_epi16_data_model; +static test_mm256_mullo_epi16_data_model g_test_mm256_mullo_epi16_data = { + {-18859, 32767, -27471, -26305, 32764, -3887, 15196, 23902, -20918, -32764, 21522, -32031, 9623, 26035, -12362, + -29218}, + {7232, -2, 12429, -7712, 8, -10226, 19422, -13707, 6528, 8, 20793, 19126, -10386, -5701, 27098, 1547}, + {-7872, 2, 5501, 30240, -32, -31890, 28104, -10250, 24320, 32, 27138, 5622, -2078, 13505, -30980, 19594}}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + int32_t expect[8]; +} test_mm256_mullo_epi32_data_model; +static test_mm256_mullo_epi32_data_model g_test_mm256_mullo_epi32_data = { + {-2147483647, 2147483645, -27471, -26305, -243156, -3887, 15196, 23902}, + {8, -10, 12429, -7712, 13030, -10226, 193422, -13707}, + {8, 30, -341437059, 202864160, 1126644616, 39748462, -1355726584, -327624714}}; + +typedef struct { + int64_t a[4]; + int64_t b[4]; + int64_t expect[4]; +} test_mm256_mullo_epi64_data_model; +static test_mm256_mullo_epi64_data_model g_test_mm256_mullo_epi64_data = { + {-9223372036854775805, 9223372036854775804, -274877871, -2654305}, + {20, -10, 1242954575787, -7744412}, + {60, 40, 8827429858442771227, 20556031493660}}; + +typedef struct { + int16_t a[32]; + int16_t b[32]; + int16_t expect[32]; +} test_mm512_mullo_epi16_data_model; +static test_mm512_mullo_epi16_data_model g_test_mm512_mullo_epi16_data = { + {-18859, 32767, -27471, -26305, 32764, -3887, 15196, 23902, -20918, -32764, 21522, + -32031, 9623, 26035, -12362, -29218, 1563, 21128, 3676, 31916, 7853, -30705, + -18084, 9343, -275, -16867, -16319, -25163, 6253, -12397, 27971, -4561}, + {7232, -2, 12429, -7712, 8, -10226, 19422, -13707, 6528, 8, 20793, + 19126, -10386, -5701, 27098, 1547, 24015, 6860, 27591, 12289, -4738, -9718, + 25965, -6234, -4345, -11521, -16129, 26072, -31896, 26659, -8847, -3234}, + {-7872, 2, 5501, 30240, -32, -31890, 28104, -10250, 24320, 32, 27138, + 5622, -2078, 13505, -30980, 19594, -16683, -27552, -25212, -17236, 16934, 5782, + 14380, 17242, 15227, 10467, 16575, 31160, -19640, 6425, 4499, 4674}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_mullo_epi32_data_model; +static test_mm512_mullo_epi32_data_model g_test_mm512_mullo_epi32_data = { + {-2147483647, 2147483645, -27471, -26305, -243156, -3887, 15196, 23902, -20918, 190380, 21522, -32031, 96233, + 260335, -12362, -292318}, + {8, -10, 12429, -7712, 13030, -10226, 193422, -13707, 6528, 22135, 20793, 191326, -10386, -5701, 27098, 15347}, + {8, 30, -341437059, 202864160, 1126644616, 39748462, -1355726584, -327624714, -136552704, -80905996, 447506946, + -1833395810, -999475938, -1484169835, -334985476, -191237050}}; + +typedef struct { + int64_t a[8]; + int64_t b[8]; + int64_t expect[8]; +} test_mm512_mullo_epi64_data_model; +static test_mm512_mullo_epi64_data_model 
g_test_mm512_mullo_epi64_data = { + {-9223372036854775805, 9223372036854775804, -274877871, -2654305, -2431275156, -3845487, 1587196, 2390272}, + {20, -10, 1242954575787, -7744412, 137842030, -104557226, 1587196, -1378707}, + {60, 40, 8827429858442771227, 20556031493660, -335131902991606680, 402073453339062, 2519191142416, -3295484738304}}; + +typedef struct { + int64_t a[8]; + int64_t b[8]; + int64_t expect[8]; +} test_mm512_mullox_epi64_data_model; +static test_mm512_mullox_epi64_data_model g_test_mm512_mullox_epi64_data = { + {-9223372036854775805, 9223372036854775804, -274877871, -2654305, -2431275156, -3845487, 1587196, 2390272}, + {20, -10, 1242954575787, -7744412, 137842030, -104557226, 1587196, -1378707}, + {60, 40, 8827429858442771227, 20556031493660, -335131902991606680, 402073453339062, 2519191142416, -3295484738304}}; + +typedef struct { + int16_t a[16]; + int16_t b[16]; + int16_t expect[16]; +} test_mm256_mulhrs_epi16_data_model; +static test_mm256_mulhrs_epi16_data_model g_test_mm256_mulhrs_epi16_data = { + {-18859, 20464, -27471, -26305, -24156, -3887, 15196, 23902, -20918, 19080, 21522, -32031, 9623, 26035, -12362, + -29218}, + {7232, -13187, 12429, -7712, 13030, -10226, 19422, -13707, 6528, 22135, 20793, 19126, -10386, -5701, 27098, 1547}, + {-4162, -8235, -10420, 6191, -9605, 1213, 9007, -9998, -4167, 12889, 13657, -18696, -3050, -4530, -10223, -1379}}; + +typedef struct { + int16_t a[32]; + int16_t b[32]; + int16_t expect[32]; +} test_mm512_mulhrs_epi16_data_model; +static test_mm512_mulhrs_epi16_data_model g_test_mm512_mulhrs_epi16_data = { + {-18859, 20464, -27471, -26305, -24156, -3887, 15196, 23902, -20918, 19080, 21522, + -32031, 9623, 26035, -12362, -29218, 1563, 21128, 3676, 31916, 7853, -30705, + -18084, 9343, -275, -16867, -16319, -25163, 6253, -12397, 27971, -4561}, + {7232, -13187, 12429, -7712, 13030, -10226, 19422, -13707, 6528, 22135, 20793, + 19126, -10386, -5701, 27098, 1547, 24015, 6860, 27591, 12289, -4738, -9718, + 25965, -6234, -4345, -11521, -16129, 26072, -31896, 26659, -8847, -3234}, + {-4162, -8235, -10420, 6191, -9605, 1213, 9007, -9998, -4167, 12889, 13657, -18696, -3050, -4530, -10223, -1379, + 1145, 4423, 3095, 11969, -1135, 9106, -14330, -1777, 36, 5930, 8033, -20021, -6087, -10086, -7552, 450}}; + +typedef struct { + int64_t a[8]; + int64_t expect[8]; +} test_mm512_bslli_epi128_data_model; +static test_mm512_bslli_epi128_data_model g_test_mm512_bslli_epi128_data = { + {1, -1, 1073741824, 12, -200, -9223372036854775807, 9223372036854775807, 0}, + {0, 1, 0, 1073741824, 0, -200, 0, 9223372036854775807}}; + +typedef struct { + int64_t a[8]; + int64_t expect[8]; +} test_mm512_bsrli_epi128_data_model; +static test_mm512_bsrli_epi128_data_model g_test_mm512_bsrli_epi128_data = { + {1, -1, 1073741824, 12, -200, -9223372036854775807, 9223372036854775807, 0}, + {-4294967296, 4294967295, 51539607552, 0, 8589934591, 2147483648, 2147483647, 0}}; + +typedef struct { + int32_t a[8]; + int64_t b[2]; + int32_t expect[8]; +} test_mm256_sll_epi32_data_model; +static test_mm256_sll_epi32_data_model g_test_mm256_sll_epi32_data = { + {1, -1, -2147483648, 2147483647, -15, 0, 1073741824, 12}, {2, 2}, {4, -4, 0, -4, -60, 0, 0, 48}}; + +typedef struct { + int64_t a[4]; + int64_t b[2]; + int64_t expect[4]; +} test_mm256_sll_epi64_data_model; +static test_mm256_sll_epi64_data_model g_test_mm256_sll_epi64_data = { + {1, -1, 1073741824, 12}, {2, 2}, {4, -4, 4294967296, 48}}; + +typedef struct { + int64_t a[8]; + int64_t b[2]; + int64_t expect[8]; +} 
test_mm512_sll_epi64_data_model; +static test_mm512_sll_epi64_data_model g_test_mm512_sll_epi64_data = { + {1, -1, 1073741824, 12, -200, -9223372036854775807, 9223372036854775807, 0}, + {2, 2}, + {4, -4, 4294967296, 48, -800, 4, -4, 0}}; + +typedef struct { + int32_t a[8]; + int b; + int32_t expect[8]; +} test_mm256_slli_epi32_data_model; +static test_mm256_slli_epi32_data_model g_test_mm256_slli_epi32_data = { + {1, -1, -2147483648, 2147483647, -15, 0, 1073741824, 12}, 2, {4, -4, 0, -4, -60, 0, 0, 48}}; + +typedef struct { + int64_t a[4]; + int b; + int64_t expect[4]; +} test_mm256_slli_epi64_data_model; +static test_mm256_slli_epi64_data_model g_test_mm256_slli_epi64_data = { + {1, -1, 1073741824, 12}, 2, {4, -4, 4294967296, 48}}; + +typedef struct { + int64_t a[4]; + uint64_t expect[4]; +} test_mm256_slli_si256_data_model; +static test_mm256_slli_si256_data_model g_test_mm256_slli_si256_data = {{1, 2, 3, 4}, {65536, 131072, 196608, 262144}}; + +typedef struct { + int64_t a[4]; + uint64_t expect[4]; +} test_mm256_srli_si256_data_model; +static test_mm256_srli_si256_data_model g_test_mm256_srli_si256_data = {{100, 200, 300, 400}, + {56294995342131200, 0, 112589990684262400, 0}}; + +typedef struct { + int64_t a[8]; + unsigned int b; + int64_t expect[8]; +} test_mm512_slli_epi64_data_model; +static test_mm512_slli_epi64_data_model g_test_mm512_slli_epi64_data = { + {1, -1, 1073741824, 12, -200, -9223372036854775807, 9223372036854775807, 0}, + 2, + {4, -4, 4294967296, 48, -800, 4, -4, 0}}; + +typedef struct { + int64_t a[4]; + int b; + uint64_t expect[4]; +} test_mm256_srli_epi64_data_model; +static test_mm256_srli_epi64_data_model g_test_mm256_srli_epi64_data = { + {1, -1, 1073741824, 12}, 2, {0, 4611686018427387903, 268435456, 3}}; + +typedef struct { + int64_t a[8]; + unsigned int b; + uint64_t expect[8]; +} test_mm512_srli_epi64_data_model; +static test_mm512_srli_epi64_data_model g_test_mm512_srli_epi64_data = { + {1, -1, 1073741824, 12, -200, -9223372036854775807, 9223372036854775807, 0}, + 2, + {0, 4611686018427387903, 268435456, 3, 4611686018427387854, 2305843009213693952, 2305843009213693951, 0}}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + int32_t expect[8]; +} test_mm256_and_si256_data_model; +static test_mm256_and_si256_data_model g_test_mm256_and_si256_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675}, + {-2113927782, 155484692, -804985344, 1077939716, 1635848192, 1074364712, 139247696, 538051733}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_and_si512_data_model; +static test_mm512_and_si512_data_model g_test_mm512_and_si512_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263, -1408379185, + 1648597848, -1670083078, 1219322582, 1967517156, 6047237, 313749857, 238532855}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675, 1270439539, + 941191503, 22951105, 2076498209, 271495372, 1471447272, 940238827, 879641860}, + {-2113927782, 155484692, -804985344, 1077939716, 1635848192, 1074364712, 139247696, 538051733, 134829635, 536940872, + 5506240, 1216626688, 268738756, 1310720, 268591457, 69599236}}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + int32_t expect[8]; +} test_mm256_or_si256_data_model; +static test_mm256_or_si256_data_model g_test_mm256_or_si256_data = 
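/* added note: the bitwise and/or/xor models use expect[i] = a[i] OP b[i]; the andnot models use (~a[i]) & b[i], matching the andnot intrinsic semantics */ 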
{ + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675}, + {-1086392354, -276970531, -186646690, -540037315, 2109988811, 2024781178, -16843393, -1445499145}, +}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_or_si512_data_model; +static test_mm512_or_si512_data_model g_test_mm512_or_si512_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263, -1408379185, + 1648597848, -1670083078, 1219322582, 1967517156, 6047237, 313749857, 238532855}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675, 1270439539, + 941191503, 22951105, 2076498209, 271495372, 1471447272, 940238827, 879641860}, + {-1086392354, -276970531, -186646690, -540037315, 2109988811, 2024781178, -16843393, -1445499145, -272769281, + 2052848479, -1652638213, 2079194103, 1970273772, 1476183789, 985397227, 1048575479}}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + int32_t expect[8]; +} test_mm256_andnot_si256_data_model; +static test_mm256_andnot_si256_data_model g_test_mm256_andnot_si256_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675}, + {3842624, -533643000, 76093770, 118431761, 402654403, 807670800, 850538536, -1983829408}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_andnot_si512_data_model; +static test_mm512_andnot_si512_data_model g_test_mm512_andnot_si512_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263, -1408379185, + 1648597848, -1670083078, 1219322582, 1967517156, 6047237, 313749857, 238532855}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675, 1270439539, + 941191503, 22951105, 2076498209, 271495372, 1471447272, 940238827, 879641860}, + {3842624, -533643000, 76093770, 118431761, 402654403, 807670800, 850538536, -1983829408, 1135609904, 404250631, + 17444865, 859871521, 2756616, 1470136552, 671647370, 810042624}}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + int32_t expect[8]; +} test_mm256_xor_si256_data_model; +static test_mm256_xor_si256_data_model g_test_mm256_xor_si256_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675}, + {1027535428, -432455223, 618338654, -1617977031, 474140619, 950416466, -156091089, -1983550878}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_xor_si512_data_model; +static test_mm512_xor_si512_data_model g_test_mm512_xor_si512_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263, -1408379185, + 1648597848, -1670083078, 1219322582, 1967517156, 6047237, 313749857, 238532855}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675, 1270439539, + 941191503, 22951105, 2076498209, 271495372, 1471447272, 940238827, 879641860}, + {1027535428, -432455223, 618338654, -1617977031, 474140619, 950416466, -156091089, -1983550878, -407598916, + 1515907607, -1658144453, 862567415, 1701535016, 1474873069, 716805770, 
978976243}}; + +typedef struct { + uint32_t a[8]; + uint32_t b[8]; + uint32_t expect[8]; +} test_mm256_or_ps_data_model; +static test_mm256_or_ps_data_model g_test_mm256_or_ps_data = { + {0xbf601831, 0xbf680b7a, 0x3f35eacf, 0xbe80965d, 0x3ce2e6f0, 0x3f6bcfbc, 0xbf59f945, 0xbf0fee59}, + {0xbf40bf7f, 0xbdbba322, 0x3f4fdfcc, 0xbf67bf79, 0x3e9c4cd3, 0x3f2efec2, 0xbef6d79c, 0xbf168421}, + {0xbf60bf7f, 0xbffbab7a, 0x3f7fffcf, 0xbfe7bf7d, 0x3efeeef3, 0x3f6ffffe, 0xbfffffdd, 0xbf1fee79}}; + +typedef struct { + uint64_t a[4]; + uint64_t b[4]; + uint64_t expect[4]; +} test_mm256_or_pd_data_model; +static test_mm256_or_pd_data_model g_test_mm256_or_pd_data = { + {0xbfe0dd0413aa2020, 0xbfb9cf2e3e2a2300, 0xbfe20b2c4e51a2a0, 0xbfc1450d464b2b80}, + {0xbfb06dc4ae637800, 0xbfdf52af25d8bc80, 0xbfe3bba35c5dff80, 0x3fbc54a7222d1600}, + {0xbff0fdc4bfeb7820, 0xbfffdfaf3ffabf80, 0xbfe3bbaf5e5dffa0, 0xbffd55af666f3f80}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_and_epi32_data_model; +static test_mm512_and_epi32_data_model g_test_mm512_and_epi32_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263, -1408379185, + 1648597848, -1670083078, 1219322582, 1967517156, 6047237, 313749857, 238532855}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675, 1270439539, + 941191503, 22951105, 2076498209, 271495372, 1471447272, 940238827, 879641860}, + {-2113927782, 155484692, -804985344, 1077939716, 1635848192, 1074364712, 139247696, 538051733, 134829635, 536940872, + 5506240, 1216626688, 268738756, 1310720, 268591457, 69599236}}; + +typedef struct { + int64_t a[8]; + int64_t b[8]; + int64_t expect[8]; +} test_mm512_and_epi64_data_model; +static test_mm512_and_epi64_data_model g_test_mm512_and_epi64_data = { + {1878421480, 1727611854, -1472180838, 1723835162, 167794177, 1854741464, 725992112, 1995068600}, + {-1292353920, -1179017088, 641699542, 1841719168, 749458416, -84593152, 729036003, 611309623}, + {586165888, 548995200, 536874130, 1686512384, 134238720, 1787105792, 725876896, 610947120}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + int32_t expect[16]; +} test_mm512_or_epi32_data_model; +static test_mm512_or_epi32_data_model g_test_mm512_or_epi32_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263, -1408379185, + 1648597848, -1670083078, 1219322582, 1967517156, 6047237, 313749857, 238532855}, + {-2110085158, -378158308, -728891574, 1196371477, 2038502595, 1882035512, 989786232, -1445777675, 1270439539, + 941191503, 22951105, 2076498209, 271495372, 1471447272, 940238827, 879641860}, + {-1086392354, -276970531, -186646690, -540037315, 2109988811, 2024781178, -16843393, -1445499145, -272769281, + 2052848479, -1652638213, 2079194103, 1970273772, 1476183789, 985397227, 1048575479}}; + +typedef struct { + int64_t a[8]; + int64_t b[8]; + int64_t expect[8]; +} test_mm512_or_epi64_data_model; +static test_mm512_or_epi64_data_model g_test_mm512_or_epi64_data = { + {-879237352, 558970432, 1710307724, 1923286300, 1125580396, 1898572736, -2042668115, -1960797766}, + {574262164, 2123305894, -650130336, -395257666, 451452356, -622573152, -1741286838, 837851923}, + {-339742820, 2145335270, -33554964, -84681282, 1543438316, -68293664, -1640014865, -1141859397}}; + +typedef struct { + uint32_t a[16]; + uint32_t b[16]; + uint32_t expect[16]; +} test_mm512_xor_ps_data_model; +static test_mm512_xor_ps_data_model 
g_test_mm512_xor_ps_data = { + {0x3f04dfd0, 0xbf0ca03a, 0x3e99dfff, 0xbe9fa46d, 0xbf5a4956, 0x3f130dc8, 0x3eeac2da, 0x3dcdbf27, 0x3f7a7645, + 0xbf7358f6, 0x3ec4ffa2, 0x3f4dc74f, 0x3ec585cf, 0xbf552561, 0x3e56d871, 0xbe47a1a3}, + {0x3efa8f95, 0x3d6d025d, 0x3f5e91a7, 0xbf39ba47, 0xbcd27fd1, 0x3ef88953, 0x3ef93c14, 0x3ee5ebda, 0x3f5f188e, + 0x3e7b90ad, 0xbeeeb782, 0x3e1349c5, 0xbf61015f, 0xbf1c4a25, 0xbf4ed9fc, 0x3efdb202}, + {0x1fe5045, 0x8261a267, 0x1c74e58, 0x1a61e2a, 0x3883687, 0x1eb849b, 0x13fece, 0x32854fd, 0x256ecb, 0x8108c85b, + 0x802a4820, 0x15e8e8a, 0x81a48490, 0x496f44, 0x8118018d, 0x80ba13a1}}; + +typedef struct { + uint64_t a[8]; + uint64_t b[8]; + uint64_t expect[8]; +} test_mm512_xor_pd_data_model; +static test_mm512_xor_pd_data_model g_test_mm512_xor_pd_data = { + {0x3fdd199ba8144040, 0xbfe9d9296f254460, 0x3f819d6d2c68a800, 0x3fdac8bc1de595c0, 0x3fed2cedb73ecd20, + 0xbfc68a0a3a08e580, 0x3fb0b2e89b20fd00, 0x3fb2b64df8507f00}, + {0x3fc707d6fab1bc00, 0x3fd79c0c2d98bc80, 0xbfe88c5e333dff80, 0xbfd50f378c4b4580, 0xbfe119404b881800, + 0x3fb7526e8bf57a00, 0x3fdcedce3b167100, 0x3fe15bf4440c03c0}, + {0x1a1e4d52a5fc40, 0x803e452542bdf8e0, 0x806911331f555780, 0x800fc78b91aed040, 0x800c35adfcb6d520, + 0x8071d864b1fd9f80, 0x6c5f26a0368c00, 0x53edb9bc5c7cc0}}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + uint8_t expect[32]; +} test_mm256_cmpeq_epi8_data_model; +static test_mm256_cmpeq_epi8_data_model g_test_mm256_cmpeq_epi8_data = { + {-128, -31, -30, -29, -16, -15, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, + 0, 1, 2, 3, 15, 16, 17, 32, 48, 120, 121, 122, 123, 124, 125, 127}, + {-128, -31, 1, 34, 12, -33, -10, -9, 7, 4, 127, 11, 23, 111, -2, -1, + 0, 1, 1, 90, 90, 22, 23, -54, 10, 48, 120, -121, -120, 77, 22, 127}, + {255, 255, 0, 0, 0, 0, 255, 255, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 255}}; + +typedef struct { + int32_t a[8]; + int32_t b[8]; + uint32_t expect[8]; +} test_mm256_cmpeq_epi32_data_model; +static test_mm256_cmpeq_epi32_data_model g_test_mm256_cmpeq_epi32_data = { + {2147483647, -2, 48792, 789, 74185, 32, 0, 23}, + {2147483647, 2, 48792, 1236589, -7895, 32, 15, 23}, + {4294967295, 0, 4294967295, 0, 0, 4294967295, 0, 4294967295}}; + +typedef struct { + int64_t a[2]; + int64_t b[2]; + uint64_t expect[2]; +} test_mm_cmpeq_epi64_data_model; +static test_mm_cmpeq_epi64_data_model g_test_mm_cmpeq_epi64_data = {{6, 7}, {60, -9}, {0, 0}}; + +typedef struct { + int32_t a[16]; + int32_t b[16]; + __mmask16 expect[8]; +} test_mm512_cmp_epi32_mask_data_model; +static test_mm512_cmp_epi32_mask_data_model g_test_mm512_cmp_epi32_mask_data = { + {1, 12, 18, -3, 45, 100, -20, -100, 5, 2, 67, -9, 24, 90, 55, 0}, + {1, 13, 10, -4, 45, 90, -202, 23, 105, 0, 67, -8, 24, 89, 53, 1}, + {5137, 35202, 40339, 0, 60398, 30333, 25196, 65535}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + __mmask64 expect[8]; +} test_mm512_cmp_epi8_mask_data_model; +static test_mm512_cmp_epi8_mask_data_model g_test_mm512_cmp_epi8_mask_data = { + {-94, -95, -110, -22, -37, 120, 91, 93, -74, 123, 19, -9, -50, -101, 50, 99, -50, 82, 22, 0, 57, -11, + 116, 111, -102, -105, 93, 96, -75, 36, -101, 1, 99, 0, -65, 68, -4, -108, 1, -93, 67, 1, -97, 62, + 21, 97, -112, -12, -68, 18, -71, -117, -113, 7, -97, 0, -1, -118, 105, 91, -123, -66, -101, -64}, + {23, 25, -61, 113, -122, 66, -24, -117, -19, 16, -8, 66, -50, -92, -81, 58, + -91, -14, 121, 101, -60, -60, -122, 111, 5, 75, -102, -125, -6, 110, 18, 40, + 59, 0, 66, 10, -21, 6, 63, -124, -101, -1, 100, -71, -65, -26, 108, 94, 
+ -2, 89, 106, -23, -100, 18, -116, 57, -39, -33, 1, 107, -123, 54, 70, -17}, + {1152921513205174272ULL, 16915454663280306447ULL, 18068376176485480719ULL, 0, 17293822560504377343ULL, + 1531289410429245168ULL, 378367897224070896ULL, 18446744073709551615ULL}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + __mmask64 expect; +} test_mm512_cmpeq_epi8_mask_data_model; +static test_mm512_cmpeq_epi8_mask_data_model g_test_mm512_cmpeq_epi8_mask_data = { + {-94, -95, -110, -22, -37, 120, 91, 93, -74, 123, 19, -9, -50, -101, 50, 99, -50, 82, 22, 0, 57, -11, + 116, 111, -102, -105, 93, 96, -75, 36, -101, 1, 99, 0, -65, 68, -4, -108, 1, -93, 67, 1, -97, 62, + 21, 97, -112, -12, -68, 18, -71, -117, -113, 7, -97, 0, -1, -118, 105, 91, -123, -66, -101, -64}, + {23, 25, -61, 113, -122, 66, -24, -117, -19, 16, -8, 66, -50, -92, -81, 58, + -91, -14, 121, 101, -60, -60, -122, 111, 5, 75, -102, -125, -6, 110, 18, 40, + 59, 0, 66, 10, -21, 6, 63, -124, -101, -1, 100, -71, -65, -26, 108, 94, + -2, 89, 106, -23, -100, 18, -116, 57, -39, -33, 1, 107, -123, 54, 70, -17}, + 1152921513205174272}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + __mmask64 k1; + __mmask64 expect; +} test_mm512_mask_cmpeq_epi8_mask_data_model; +static test_mm512_mask_cmpeq_epi8_mask_data_model g_test_mm512_mask_cmpeq_epi8_mask_data = { + {-94, -95, -110, -22, -37, 120, 91, 93, -74, 123, 19, -9, -50, -101, 50, 99, -50, 82, 22, 0, 57, -11, + 116, 111, -102, -105, 93, 96, -75, 36, -101, 1, 99, 0, -65, 68, -4, -108, 1, -93, 67, 1, -97, 62, + 21, 97, -112, -12, -68, 18, -71, -117, -113, 7, -97, 0, -1, -118, 105, 91, -123, -66, -101, -64}, + {23, 25, -61, 113, -122, 66, -24, -117, -19, 16, -8, 66, -50, -92, -81, 58, + -91, -14, 121, 101, -60, -60, -122, 111, 5, 75, -102, -125, -6, 110, 18, 40, + 59, 0, 66, 10, -21, 6, 63, -124, -101, -1, 100, -71, -65, -26, 108, 94, + -2, 89, 106, -23, -100, 18, -116, 57, -39, -33, 1, 107, -123, 54, 70, -17}, + 2120728348, + 4096}; + +typedef struct { + int32_t a[16]; + int32_t expect[16]; +} test_mm512_set_epi32_data_model; +static test_mm512_set_epi32_data_model g_test_mm512_set_epi32_data = { + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263, -1408379185, + 1648597848, -1670083078, 1219322582, 1967517156, 6047237, 313749857, 238532855}, + {-1090234978, 256672469, -262740460, -658469076, 1707334408, 1217110378, -867381929, 538330263, -1408379185, + 1648597848, -1670083078, 1219322582, 1967517156, 6047237, 313749857, 238532855}}; + +typedef struct { + int64_t a[8]; + int64_t expect[8]; +} test_mm512_set_epi64_data_model; +static test_mm512_set_epi64_data_model g_test_mm512_set_epi64_data = { + {-9223372036854775805, 9223372036854775804, -274877871, -2654305, -2431275156, -3845487, 1587196, 2390272}, + {-9223372036854775805, 9223372036854775804, -274877871, -2654305, -2431275156, -3845487, 1587196, 2390272}}; + +typedef struct { + int32_t a; + int32_t expect[16]; +} test_mm512_set1_epi32_data_model; +static test_mm512_set1_epi32_data_model g_test_mm512_set1_epi32_data = { + -1090234978, + {-1090234978, -1090234978, -1090234978, -1090234978, -1090234978, -1090234978, -1090234978, -1090234978, + -1090234978, -1090234978, -1090234978, -1090234978, -1090234978, -1090234978, -1090234978, -1090234978}}; + +typedef struct { + int64_t a; + int64_t expect[8]; +} test_mm512_set1_epi64_data_model; +static test_mm512_set1_epi64_data_model g_test_mm512_set1_epi64_data = { + -9223372036854775805, + {-9223372036854775805, -9223372036854775805, -9223372036854775805, 
-9223372036854775805, -9223372036854775805, + -9223372036854775805, -9223372036854775805, -9223372036854775805}}; + +typedef struct { + char a; + int8_t expect[64]; +} test_mm512_set1_epi8_data_model; +static test_mm512_set1_epi8_data_model g_test_mm512_set1_epi8_data = { + (char)-43, + {-43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, + -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, + -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43, -43}}; + +typedef struct { + float32_t a[16]; + float32_t expect[16]; +} test_mm512_set_ps_data_model; +static test_mm512_set_ps_data_model g_test_mm512_set_ps_data = { + {1.550000, 127.245003, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480, 16.145000, + 20.000999, 58.698002, 57.146999, 67.156998, 30.124500, 40.658001, 14.650000}, + {1.550000, 127.245003, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480, 16.145000, + 20.000999, 58.698002, 57.146999, 67.156998, 30.124500, 40.658001, 14.650000}}; + +typedef struct { + float64_t a[8]; + float64_t expect[8]; +} test_mm512_set_pd_data_model; +static test_mm512_set_pd_data_model g_test_mm512_set_pd_data = { + {895828870.5853160620, 13198299.6345069464, -46353094.4883757681, 1008132723.0642696619, 1132533339.7273423672, + 190439947.2382000089, -1511701151.4810729027, 446683041.9176963568}, + {895828870.5853160620, 13198299.6345069464, -46353094.4883757681, 1008132723.0642696619, 1132533339.7273423672, + 190439947.2382000089, -1511701151.4810729027, 446683041.9176963568}}; + +typedef struct { + float32_t a; + float32_t expect[16]; +} test_mm512_set1_ps_data_model; +static test_mm512_set1_ps_data_model g_test_mm512_set1_ps_data = { + 58.698002, + {58.698002, 58.698002, 58.698002, 58.698002, 58.698002, 58.698002, 58.698002, 58.698002, 58.698002, 58.698002, + 58.698002, 58.698002, 58.698002, 58.698002, 58.698002, 58.698002}}; + +typedef struct { + float64_t a; + float64_t expect[8]; +} test_mm512_set1_pd_data_model; +static test_mm512_set1_pd_data_model g_test_mm512_set1_pd_data = { + -1511701151.4810729027, + {-1511701151.4810729027, -1511701151.4810729027, -1511701151.4810729027, -1511701151.4810729027, + -1511701151.4810729027, -1511701151.4810729027, -1511701151.4810729027, -1511701151.4810729027}}; + +typedef struct { + float32_t expect[16]; +} test_mm512_setzero_ps_data_model; +static test_mm512_setzero_ps_data_model g_test_mm512_setzero_ps_data = { + {0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; + +typedef struct { + float64_t expect[8]; +} test_mm512_setzero_pd_data_model; +static test_mm512_setzero_pd_data_model g_test_mm512_setzero_pd_data = {{0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; + +typedef struct { + float32_t a[4]; + float32_t b[4]; + float32_t expect[4]; +} test_mm_move_ss_data_model; +static test_mm_move_ss_data_model g_test_mm_move_ss_data = { + {1.0, 1.0, 1.0, 1.0}, {2.0, 2.0, 2.0, 2.0}, {2.0, 1.0, 1.0, 1.0}}; + +typedef struct { + float64_t a[2]; + float64_t b[2]; + float64_t expect[2]; +} test_mm_move_sd_data_model; +static test_mm_move_sd_data_model g_test_mm_move_sd_data = {{1.0, 1.0}, {2.0, 2.0}, {2.0, 1.0}}; + +typedef struct { + int8_t a[32]; + int expect; +} test_mm256_movemask_epi8_data_model; +static test_mm256_movemask_epi8_data_model g_test_mm256_movemask_epi8_data = { + {-2, 30, 127, 100, 4, 8, 10, -43, -56, 102, 120, 70, 45, -12, 20, 
27, + 1, -4, 18, 50, -49, 7, 0, 80, 8, 6, -7, 15, 0, 9, 11, -6}, + (int)0x84122181}; + +typedef struct { + float32_t a[8]; + int expect; +} test_mm256_movemask_ps_data_model; +static test_mm256_movemask_ps_data_model g_test_mm256_movemask_ps_data = { + {1.500000, 127.199997, 8.000000, -9.562000, -100.000000, -128.845703, 1045481.000000, 7.125480}, 0x00000038}; + +typedef struct { + int8_t a[16]; + int8_t b[16]; + int expect; +} test_mm_testz_si128_data_model; +static test_mm_testz_si128_data_model g_test_mm_testz_si128_data = { + {-2, 30, 127, 100, 4, 8, 10, -43, -56, 102, 120, 70, 45, -12, 20, 27}, + {1, -31, -128, -101, -5, -9, -11, 42, 55, -103, -121, -71, -46, 11, -21, -28}, + 1}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + int8_t expect; +} test_mm256_testz_si256_data_model; +static test_mm256_testz_si256_data_model g_test_mm256_testz_si256_data = { + {107, -45, 58, -11, 46, 84, 2, 91, 38, -92, 6, -5, -33, 14, -96, -19, + 90, 5, 52, 45, 70, -44, 45, 24, -109, 30, 44, 108, -17, 22, -122, 27}, + {-108, 44, -59, 10, -47, -85, -3, -92, -39, 91, -7, 4, 32, -15, 95, 18, + -91, -6, -53, -46, -71, 43, -46, -25, 108, -31, -45, -109, 16, -23, 121, -28}, + 1}; + +typedef struct { + unsigned long long a; + int8_t expect[64]; +} test_mm512_movm_epi8_data_model; +static test_mm512_movm_epi8_data_model g_test_mm512_movm_epi8_data = { + 0x123456789ABCDEF0, {0, 0, 0, 0, -1, -1, -1, -1, 0, -1, -1, -1, -1, 0, -1, -1, 0, 0, -1, -1, -1, -1, + 0, -1, 0, -1, 0, -1, -1, 0, 0, -1, 0, 0, 0, -1, -1, -1, -1, 0, 0, -1, -1, 0, + -1, 0, -1, 0, 0, 0, -1, 0, -1, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, 0}}; + +typedef struct { + int32_t a[4]; + const int b; + int32_t expect; +} test_mm_extract_epi32_data_model; +static test_mm_extract_epi32_data_model g_test_mm_extract_epi32_data = {{13, 25, 31, 47}, 3, 47}; + +typedef struct { + int64_t a[2]; + const int b; + int64_t expect; +} test_mm_extract_epi64_data_model; +static test_mm_extract_epi64_data_model g_test_mm_extract_epi64_data = { + {6776653639822200585, 1668641392784102205}, 0, 6776653639822200585}; + +typedef struct { + int64_t a[4]; + const int b; + int64_t expect[2]; +} test_mm256_extracti128_si256_data_model; +static test_mm256_extracti128_si256_data_model g_test_mm256_extracti128_si256_data = { + {-15, 978, 16, 654}, 1, {16, 654}}; + +typedef struct { + float32_t a[4]; + const int b; + int expect; +} test_mm_extract_ps_data_model; +static test_mm_extract_ps_data_model g_test_mm_extract_ps_data = {{1.0, 2.0, 3.0, 4.0}, 3, 1082130432}; + +typedef struct { + int32_t a[8]; + const int b; + int expect; +} test_mm256_extract_epi32_data_model; +static test_mm256_extract_epi32_data_model g_test_mm256_extract_epi32_data = {{1, 2, 3, 4, 5, 6, 7, 8}, 3, 4}; + +typedef struct { + int64_t a[4]; + const int b; + int64_t expect; +} test_mm256_extract_epi64_data_model; +static test_mm256_extract_epi64_data_model g_test_mm256_extract_epi64_data = {{1, 2, 3, 4}, 2, 3}; + +typedef struct { + float32_t a[8]; + const int b; + float32_t expect[4]; +} test_mm256_extractf128_ps_data_model; +static test_mm256_extractf128_ps_data_model g_test_mm256_extractf128_ps_data = { + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, 1, {5.0, 6.0, 7.0, 8.0}}; + +typedef struct { + float64_t a[4]; + const int b; + float64_t expect[2]; +} test_mm256_extractf128_pd_data_model; +static test_mm256_extractf128_pd_data_model g_test_mm256_extractf128_pd_data = {{1.0, 2.0, 3.0, 4.0}, 1, {3.0, 4.0}}; + +typedef struct { + float32_t a[16]; + const int b; + float32_t expect[8]; +} 
test_mm512_extractf32x8_ps_data_model; +static test_mm512_extractf32x8_ps_data_model g_test_mm512_extractf32x8_ps_data = { + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, -1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}, + 1, + {-1.0, -2.0, -3.0, -4.0, -5.0, -6.0, -7.0, -8.0}}; + +typedef struct { + float64_t a[8]; + const int b; + float64_t expect[4]; +} test_mm512_extractf64x4_pd_data_model; +static test_mm512_extractf64x4_pd_data_model g_test_mm512_extractf64x4_pd_data = { + {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}, 1, {5.0, 6.0, 7.0, 8.0}}; + +typedef struct { + unsigned int crc; + unsigned char v; + unsigned int expect; +} test_mm_crc32_u8_data_model; +static test_mm_crc32_u8_data_model g_test_mm_crc32_u8_data = {4153545204, 155, 1059208535}; + +typedef struct { + unsigned int crc; + unsigned short v; + unsigned int expect; +} test_mm_crc32_u16_data_model; +static test_mm_crc32_u16_data_model g_test_mm_crc32_u16_data = {3730857934, 26977, 1970496319}; + +typedef struct { + unsigned int crc; + unsigned int v; + unsigned int expect; +} test_mm_crc32_u32_data_model; +static test_mm_crc32_u32_data_model g_test_mm_crc32_u32_data = {4195592487, 4146244042, 2462366937 + +}; + +typedef struct { + unsigned long crc; + unsigned long v; + unsigned long expect; +} test_mm_crc32_u64_data_model; +static test_mm_crc32_u64_data_model g_test_mm_crc32_u64_data = {4257207552971783972, 4094426903918957320, 3846645531}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + int8_t expect[32]; +} test_mm256_shuffle_epi8_data_model; +static test_mm256_shuffle_epi8_data_model g_test_mm256_shuffle_epi8_data = { + {-111, 14, 91, 125, -59, 66, 97, -83, 21, 62, 42, -41, -37, 26, 63, -1, + 38, 66, -9, -2, -109, 9, -81, -92, 19, 87, -16, -61, -108, 73, -120, -109}, + {111, -60, 20, -117, -77, -122, -80, -75, 65, -19, -44, 20, 88, 21, 106, 1, + -70, -48, 66, 59, 42, -128, -61, -85, 108, 58, 124, 72, -89, -91, -40, 100}, + {-1, 0, -59, 0, 0, 0, 0, 0, 14, 0, 0, -59, 21, 66, 42, 14, + 0, 0, -9, -61, -16, 0, 0, 0, -108, -16, -108, 19, 0, 0, 0, -109}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + int8_t expect[64]; +} test_mm512_shuffle_epi8_data_model; +static test_mm512_shuffle_epi8_data_model g_test_mm512_shuffle_epi8_data = { + {-29, 70, -9, -121, 47, -52, 74, -83, -97, 4, 122, -42, -97, 34, -66, 84, + -43, 30, 64, 68, 64, -54, 43, -89, 44, 26, -56, -37, 71, -97, -38, -36, + -105, -85, -68, 1, -83, -58, -77, 72, -27, -41, -104, -53, -51, 32, -108, 17, + 94, -11, -103, 25, 30, -111, 95, -47, 83, -88, -128, -21, -120, -81, -96, 22}, + {91, -15, 82, -86, -10, -62, -32, -51, -106, -36, 26, 112, -121, -76, 21, 95, 108, 63, 50, 123, -121, -89, + -56, -92, 113, -77, -128, 0, 94, -89, 59, 48, 123, 16, 74, -87, -84, -124, 50, -91, -43, -26, -37, -89, + -57, 86, -83, 113, 55, 11, 51, 99, -75, -107, -93, -15, -43, 89, 69, -82, -84, -85, 102, 81}, + {-42, 0, -9, 0, 0, 0, 0, 0, 0, 0, 122, -29, 0, 0, -52, 84, 71, -36, 64, -37, 0, 0, + 0, 0, 30, 0, 0, -43, -38, 0, -37, -43, -53, -105, -104, 0, 0, 0, -68, 0, 0, 0, 0, 0, + 0, -77, 0, -85, -47, -21, 25, 25, 0, 0, 0, 0, 0, -88, -111, 0, 0, 0, 95, -11}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + unsigned long k; + int8_t expect[64]; +} test_mm512_maskz_shuffle_epi8_data_model; +static test_mm512_maskz_shuffle_epi8_data_model g_test_mm512_maskz_shuffle_epi8_data = { + {-29, 70, -9, -121, 47, -52, 74, -83, -97, 4, 122, -42, -97, 34, -66, 84, + -43, 30, 64, 68, 64, -54, 43, -89, 44, 26, -56, -37, 71, -97, -38, -36, + -105, -85, -68, 1, -83, -58, -77, 72, -27, -41, -104, -53, 
-51, 32, -108, 17, + 94, -11, -103, 25, 30, -111, 95, -47, 83, -88, -128, -21, -120, -81, -96, 22}, + {91, -15, 82, -86, -10, -62, -32, -51, -106, -36, 26, 112, -121, -76, 21, 95, 108, 63, 50, 123, -121, -89, + -56, -92, 113, -77, -128, 0, 94, -89, 59, 48, 123, 16, 74, -87, -84, -124, 50, -91, -43, -26, -37, -89, + -57, 86, -83, 113, 55, 11, 51, 99, -75, -107, -93, -15, -43, 89, 69, -82, -84, -85, 102, 81}, + 0x123456789ABCDEF, + {-42, 0, -9, 0, 0, 0, 0, 0, 0, 0, 122, -29, 0, 0, -52, 84, 71, -36, 0, -37, 0, 0, + 0, 0, 30, 0, 0, -43, 0, 0, 0, -43, -53, -105, -104, 0, 0, 0, -68, 0, 0, 0, 0, 0, + 0, 0, 0, 0, -47, -21, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}}; + +typedef struct { + uint8_t a[32]; + uint8_t b[32]; + uint8_t expect[32]; +} test_mm256_multishift_epi64_epi8_data_model; +static test_mm256_multishift_epi64_epi8_data_model g_test_mm256_multishift_epi64_epi8_data = { + {21, 150, 198, 156, 122, 182, 234, 69, 123, 2, 66, 128, 182, 84, 184, 14, 243, 137, 0, 176, 235, 16, 87, 195, 1, + 136, 113, 107, 157, 191, 133, 34}, + {46, 188, 91, 246, 197, 149, 241, 238, 171, 181, 172, 40, 148, 152, 197, 205, 165, 186, 38, 122, 102, 213, 85, 211, + 197, 67, 8, 39, 191, 85, 205, 120}, + {178, 217, 240, 95, 187, 187, 101, 225, 121, 106, 106, 171, 55, 138, 205, 178, 106, 93, 165, 85, 186, 38, 244, 84, + 226, 67, 102, 170, 249, 138, 30, 111}}; + +typedef struct { + uint8_t a[64]; + uint8_t b[64]; + uint8_t expect[64]; +} test_mm512_multishift_epi64_epi8_data_model; +static test_mm512_multishift_epi64_epi8_data_model g_test_mm512_multishift_epi64_epi8_data = { + {117, 16, 56, 169, 126, 165, 25, 50, 198, 169, 245, 191, 100, 211, 206, 22, 121, 2, 16, 36, 159, 107, 161, 194, 104, + 169, 165, 194, 191, 98, 152, 209, 159, 194, 127, 61, 177, 214, 70, 185, 3, 11, 204, 178, 98, 25, 141, 11, 189, 219, + 181, 90, 194, 184, 200, 183, 212, 117, 144, 44, 60, 237, 47, 246}, + {118, 191, 233, 115, 243, 221, 215, 10, 202, 66, 58, 101, 86, 89, 3, 67, 38, 177, 171, 142, 202, 184, 157, 103, 107, + 93, 105, 9, 149, 141, 120, 17, 211, 175, 231, 30, 169, 82, 152, 174, 99, 169, 205, 139, 21, 250, 234, 44, 206, 60, + 25, 102, 48, 8, 113, 204, 194, 160, 39, 242, 69, 202, 248, 236}, + {86, 233, 10, 238, 216, 239, 185, 181, 11, 172, 24, 148, 149, 167, 233, 148, 51, 73, 171, 140, 149, 183, 101, 73, + 141, 70, 108, 90, 214, 101, 9, 180, 82, 244, 167, 157, 76, 123, 191, 215, 44, 181, 218, 58, 133, 197, 109, 181, + 118, 12, 99, 25, 51, 204, 60, 152, 34, 103, 39, 140, 46, 198, 241, 179}}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + int8_t expect[32]; +} test_mm256_unpacklo_epi8_data_model; +static test_mm256_unpacklo_epi8_data_model g_test_mm256_unpacklo_epi8_data = { + {-59, -99, 96, 41, 74, 33, -117, -24, 65, -4, -28, -20, 28, 65, 91, -66, + -100, 87, -100, -118, 59, -41, -71, 20, -5, -52, 126, -113, 110, 53, -22, -10}, + {77, -61, -72, 31, -39, -63, 77, -15, 72, 54, -119, 101, 84, 27, -23, 21, + -2, 51, 49, -18, -57, -47, 27, -109, -86, 39, -103, -121, -110, 14, 124, 50}, + {-59, 77, -99, -61, 96, -72, 41, 31, 74, -39, 33, -63, -117, 77, -24, -15, + -100, -2, 87, 51, -100, 49, -118, -18, 59, -57, -41, -47, -71, 27, 20, -109}}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + int8_t expect[32]; +} test_mm256_unpackhi_epi8_data_model; +static test_mm256_unpackhi_epi8_data_model g_test_mm256_unpackhi_epi8_data = { + {-51, -23, 2, -55, 96, -45, 17, 37, 84, -116, -126, -62, -69, 9, 60, 40, + 99, 50, 80, 97, 4, -109, 35, -87, -91, -99, 12, 55, 22, 123, -76, 87}, + {-66, 93, 103, 34, 105, 42, 81, 45, 46, -107, 114, -7, 31, 
65, 23, -9, + -109, -10, 40, 123, -62, 120, -85, 65, 81, 91, -90, -94, -75, 93, -20, 67}, + {84, 46, -116, -107, -126, 114, -62, -7, -69, 31, 9, 65, 60, 23, 40, -9, + -91, 81, -99, 91, 12, -90, 55, -94, 22, -75, 123, 93, -76, -20, 87, 67}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + int8_t expect[64]; +} test_mm512_unpacklo_epi8_data_model; +static test_mm512_unpacklo_epi8_data_model g_test_mm512_unpacklo_epi8_data = { + {-3, 0, 93, 51, -111, -78, -27, -94, -21, 10, -88, 81, -22, -95, -77, 27, -63, 34, 105, 21, -77, -45, + 8, -66, 88, -100, 51, 119, 65, -55, 62, 114, -64, -10, 101, 106, 125, 117, 63, -21, -28, 46, -70, -31, + 11, 32, 41, 88, 52, 110, 109, 62, -125, -38, -124, -76, 22, -38, -56, -100, 64, -55, 6, 81}, + {110, 107, -26, 76, 61, -100, 28, 26, 7, -57, 85, 111, -85, 113, -21, 67, -99, 3, -21, 78, 92, 3, + -116, 43, -70, -20, -58, 77, -88, -43, -122, 111, 92, 85, 56, -4, -37, 91, 126, 112, 24, 9, 27, 76, + -50, 9, 79, -40, -96, 86, 83, -125, 118, 118, 125, -35, -74, 99, -109, -10, -65, -54, 76, -27}, + {-3, 110, 0, 107, 93, -26, 51, 76, -111, 61, -78, -100, -27, 28, -94, 26, -63, -99, 34, 3, 105, -21, + 21, 78, -77, 92, -45, 3, 8, -116, -66, 43, -64, 92, -10, 85, 101, 56, 106, -4, 125, -37, 117, 91, + 63, 126, -21, 112, 52, -96, 110, 86, 109, 83, 62, -125, -125, 118, -38, 118, -124, 125, -76, -35}}; + +typedef struct { + int8_t a[64]; + int8_t b[64]; + int8_t expect[64]; +} test_mm512_unpackhi_epi8_data_model; +static test_mm512_unpackhi_epi8_data_model g_test_mm512_unpackhi_epi8_data = { + {50, 36, -102, 43, -124, -47, -17, -68, -99, 17, -40, 100, 61, -67, 102, -81, + 1, 68, -100, 82, -18, 25, -65, -118, 93, -122, 23, -126, 0, 81, -39, -6, + -54, 36, 57, 62, -107, 10, -118, -55, 67, 115, -12, 73, -117, 60, 67, -74, + -20, -77, -36, -48, 26, -12, 13, -44, -91, 30, -112, 89, -89, -27, 116, 105}, + {36, -8, -20, -81, -120, 52, -12, -50, -18, 80, -70, 11, 36, -105, -109, -15, 99, 4, 70, -53, 90, 25, + 19, 87, -19, -85, 48, -89, -70, 19, 64, -22, 114, -107, 76, 24, -70, 52, -12, 78, -3, 41, -28, -77, + -46, 120, -84, 120, -16, -43, 100, -52, -12, 18, -21, -79, 15, 94, -86, 21, -110, 69, 104, -57}, + {-99, -18, 17, 80, -40, -70, 100, 11, 61, 36, -67, -105, 102, -109, -81, -15, 93, -19, -122, -85, 23, 48, + -126, -89, 0, -70, 81, 19, -39, 64, -6, -22, 67, -3, 115, 41, -12, -28, 73, -77, -117, -46, 60, 120, + 67, -84, -74, 120, -91, 15, 30, 94, -112, -86, 89, 21, -89, -110, -27, 69, 116, 104, 105, -57}}; +typedef struct { + int32_t a[8]; + int32_t expect[8]; +} test_mm256_store_si256_data_model; +static test_mm256_store_si256_data_model g_test_mm256_store_si256_data = {{-59, -99, 96, 41, 74, 33, -117, -24}, + {-59, -99, 96, 41, 74, 33, -117, -24}}; + +typedef struct { + int32_t a[8]; + int32_t expect[8]; +} test_mm256_storeu_si256_data_model; +static test_mm256_storeu_si256_data_model g_test_mm256_storeu_si256_data = {{-59, -99, 96, 41, 74, 33, -117, -24}, + {-59, -99, 96, 41, 74, 33, -117, -24}}; + +typedef struct { + int32_t a[16]; + int32_t expect[16]; +} test_mm512_store_si512_data_model; +static test_mm512_store_si512_data_model g_test_mm512_store_si512_data = { + {-59, -99, 96, 41, 74, 33, -117, -24, -9, 19, 6, 12, 33, 4, -11, -4}, + {-59, -99, 96, 41, 74, 33, -117, -24, -9, 19, 6, 12, 33, 4, -11, -4}}; + +typedef struct { + int32_t a[8]; + int32_t b[4]; + int32_t expect[8]; +} test_mm256_inserti128_si256_data_model; +static test_mm256_inserti128_si256_data_model g_test_mm256_inserti128_si256_data = { + {13, -115, 0, 400, 32, 98, 36, -78}, {466, 100, 3, -7}, {466, 100, 
3, -7, 32, 98, 36, -78}}; + +typedef struct { + float32_t a[8]; + float32_t b[4]; + int imm; + float32_t expect[8]; +} test_mm256_insertf128_ps_data_model; +static test_mm256_insertf128_ps_data_model g_test_mm256_insertf128_ps_data = { + {1.000000, 0.000000, -23.559999, 100.199997, -345.000000, -399.450012, -888.658020, 10.220000}, + {111.345001, -342.000000, 109.000000, 799.654724}, + 0, + {111.345001, -342.000000, 109.000000, 799.654724, -345.000000, -399.450012, -888.658020, 10.220000}}; + +typedef struct { + float64_t a[4]; + float64_t b[2]; + int imm; + float64_t expect[4]; +} test_mm256_insertf128_pd_data_model; +static test_mm256_insertf128_pd_data_model g_test_mm256_insertf128_pd_data = { + {1.000000, 0.000000, -23.560000, 100.200000}, + {-345.000000, -399.450000}, + 0, + {-345.000000, -399.450000, -23.560000, 100.200000}}; + +typedef struct { + int64_t a[4]; + int imm; + int64_t expect[4]; +} test_mm256_permute4x64_epi64_data_model; +static test_mm256_permute4x64_epi64_data_model g_test_mm256_permute4x64_epi64_data = { + {100, 200, 300, 400}, 27, {400, 300, 200, 100}}; + +typedef struct { + float64_t a[2]; + float64_t expect[2]; +} test_mm_set_pd_data_model; +static test_mm_set_pd_data_model g_test_mm_set_pd_data = {{2.019, 11.15}, {2.019, 11.15}}; + +typedef struct { + int32_t a[8]; + int32_t expect[8]; +} test_mm256_set_epi32_data_model; +static test_mm256_set_epi32_data_model g_test_mm256_set_epi32_data = { + {1, 2, 3, 4, 5, 6, 7, 8}, + {1, 2, 3, 4, 5, 6, 7, 8}, +}; + +typedef struct { + int64_t a[8]; + int64_t expect[8]; +} test_mm256_set_epi64x_data_model; +static test_mm256_set_epi64x_data_model g_test_mm256_set_epi64x_data = { + {1LL, 2LL, 3LL, 4LL}, + {1LL, 2LL, 3LL, 4LL}, +}; + +typedef struct { + int32_t a[8]; + int32_t expect[8]; +} test_mm256_set_m128i_data_model; +static test_mm256_set_m128i_data_model g_test_mm256_set_m128i_data = { + {1, 2, 3, 4, 5, 6, 7, 8}, + {1, 2, 3, 4, 5, 6, 7, 8}, +}; + +typedef struct { + float32_t a[8]; + float32_t expect[8]; +} test_mm256_set_ps_data_model; +static test_mm256_set_ps_data_model g_test_mm256_set_ps_data = { + {2.0f, 1.9f, 1.0f, 1.0f, 1.0f, 5.0f, 1.2f, 0.8f}, + {2.0f, 1.9f, 1.0f, 1.0f, 1.0f, 5.0f, 1.2f, 0.8f}, +}; + +typedef struct { + float64_t a[4]; + float64_t expect[4]; +} test_mm256_set_pd_data_model; +static test_mm256_set_pd_data_model g_test_mm256_set_pd_data = { + {2.019, 1.1, 1.5, 1.208}, + {2.019, 1.1, 1.5, 1.208}, +}; + +typedef struct { + int32_t expect[8]; +} test_mm256_setzero_si256_data_model; +static test_mm256_setzero_si256_data_model g_test_mm256_setzero_si256_data = { + {0, 0, 0, 0, 0, 0, 0, 0}, +}; + +typedef struct { + float32_t expect[8]; +} test_mm256_setzero_ps_data_model; +static test_mm256_setzero_ps_data_model g_test_mm256_setzero_ps_data = { + {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}, +}; + +typedef struct { + float64_t expect[4]; +} test_mm256_setzero_pd_data_model; +static test_mm256_setzero_pd_data_model g_test_mm256_setzero_pd_data = { + {0.0, 0.0, 0.0, 0.0}, +}; + +typedef struct { + int64_t a; + int64_t expect[2]; +} test_mm_set1_epi64x_data_model; +static test_mm_set1_epi64x_data_model g_test_mm_set1_epi64x_data = { + 2019LL, + {2019LL, 2019LL}, +}; + +typedef struct { + float64_t a; + float64_t expect[2]; +} test_mm_set1_pd_data_model; +static test_mm_set1_pd_data_model g_test_mm_set1_pd_data = { + 2.019, + {2.019, 2.019}, +}; + +typedef struct { + int8_t a; + int8_t expect[32]; +} test_mm256_set1_epi8_data_model; +static test_mm256_set1_epi8_data_model g_test_mm256_set1_epi8_data = 
{ + '0', + {'0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', + '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'}, +}; + +typedef struct { + int32_t a; + int32_t expect[8]; +} test_mm256_set1_epi32_data_model; +static test_mm256_set1_epi32_data_model g_test_mm256_set1_epi32_data = { + 2019, + {2019, 2019, 2019, 2019, 2019, 2019, 2019, 2019}, +}; + +typedef struct { + int64_t a; + int64_t expect[4]; +} test_mm256_set1_epi64x_data_model; +static test_mm256_set1_epi64x_data_model g_test_mm256_set1_epi64x_data = { + 2019LL, + {2019LL, 2019LL, 2019LL, 2019LL}, +}; + +typedef struct { + float64_t a; + float64_t expect[4]; +} test_mm256_set1_pd_data_model; +static test_mm256_set1_pd_data_model g_test_mm256_set1_pd_data = { + 2.019, + {2.019, 2.019, 2.019, 2.019}, +}; + +typedef struct { + float32_t a; + float32_t expect[8]; +} test_mm256_set1_ps_data_model; +static test_mm256_set1_ps_data_model g_test_mm256_set1_ps_data = { + 2.019f, + {2.019f, 2.019f, 2.019f, 2.019f, 2.019f, 2.019f, 2.019f, 2.019f}, +}; + +typedef struct { + int32_t a[8]; + int32_t expect[8]; +} test_mm256_load_si256_data_model; +static test_mm256_load_si256_data_model g_test_mm256_load_si256_data = { + {1, 2, 3, 4, 5, 6, 7, 8}, + {1, 2, 3, 4, 5, 6, 7, 8}, +}; + +typedef struct { + int32_t a[8]; + int32_t expect[8]; +} test_mm256_loadu_si256_data_model; +static test_mm256_loadu_si256_data_model g_test_mm256_loadu_si256_data = { + {1, 2, 3, 4, 5, 6, 7, 8}, + {1, 2, 3, 4, 5, 6, 7, 8}, +}; + +typedef struct { + int32_t a[8]; + int32_t mask[8]; + int32_t expect[8]; +} test_mm256_maskload_epi32_data_model; +static test_mm256_maskload_epi32_data_model g_test_mm256_maskload_epi32_data = { + {1, 2, 3, 4, 5, 6, 7, 8}, + {-1, -1, -1, 0, 0, 0, 0, -1}, + {1, 2, 3, 0, 0, 0, 0, 8}, +}; + +typedef struct { + int32_t __attribute__((aligned(64))) a[16]; + int32_t __attribute__((aligned(64))) expect[16]; +} test_mm512_load_si512_data_model; +static test_mm512_load_si512_data_model g_test_mm512_load_si512_data = { + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, +}; + +typedef struct { + int32_t a[16]; + int32_t expect[16]; +} test_mm512_loadu_si512_data_model; +static test_mm512_loadu_si512_data_model g_test_mm512_loadu_si512_data = { + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, +}; + +typedef struct { + int8_t src; + unsigned long long mask; + int8_t mem_addr[64]; + int8_t expect[64]; +} test_mm512_mask_loadu_epi8_data_model; +static test_mm512_mask_loadu_epi8_data_model g_test_mm512_mask_loadu_epi8_data = { + 0x01, + 0xFFFFFF00000000FFULL, + {0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00}, + + {0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x0B, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00}, +}; + +typedef struct { + 
unsigned long long k; + int8_t mem_addr[64]; + int8_t expect[64]; +} test_mm512_maskz_loadu_epi8_data_model; +static test_mm512_maskz_loadu_epi8_data_model g_test_mm512_maskz_loadu_epi8_data = { + 0xFFFFFF00000000FFULL, + {0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x09, 0x00, 0x00, 0x00, 0x0A, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00}, + + {0x01, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x0C, 0x00, 0x00, 0x00, + 0x0D, 0x00, 0x00, 0x00, 0x0E, 0x00, 0x00, 0x00, 0x0F, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00}, +}; + +typedef struct { + int8_t a[64]; + int8_t expect[64]; +} test_mm512_abs_epi8_data_model; +static test_mm512_abs_epi8_data_model g_test_mm512_abs_epi8_data = { + {1, 0, 0, 0, 2, 0, 0, 0, -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, + -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, -1, -2, -3, -4, 11, 0, 0, 0, + 12, 0, 0, 0, 13, 0, 0, 0, 14, 0, 0, 0, 15, 0, 0, 0, 16, 0, 0, 0}, + + {1, 0, 0, 0, 2, 0, 0, 0, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, + 1, 2, 3, 4, 1, 2, 3, 4, 11, 0, 0, 0, 12, 0, 0, 0, 13, 0, 0, 0, 14, 0, 0, 0, 15, 0, 0, 0, 16, 0, 0, 0}, +}; + +typedef struct { + int64_t a[2]; + int64_t expect[4]; +} test_mm256_broadcastq_epi64_data_model; +static test_mm256_broadcastq_epi64_data_model g_test_mm256_broadcastq_epi64_data = { + {2019LL, 2018LL}, + {2019LL, 2019LL, 2019LL, 2019LL}, +}; + +typedef struct { + int64_t a[2]; + int64_t expect[4]; +} test_mm256_broadcastsi128_si256_data_model; +static test_mm256_broadcastsi128_si256_data_model g_test_mm256_broadcastsi128_si256_data = { + {2019LL, 2018LL}, + {2019LL, 2018LL, 2019LL, 2018LL}, +}; + +typedef struct { + int32_t a[4]; + int32_t expect[16]; +} test_mm512_broadcast_i32x4_data_model; +static test_mm512_broadcast_i32x4_data_model g_test_mm512_broadcast_i32x4_data = { + {2016, 2017, 2018, 2019}, + {2016, 2017, 2018, 2019, 2016, 2017, 2018, 2019, 2016, 2017, 2018, 2019, 2016, 2017, 2018, 2019}}; + +typedef struct { + int64_t a[4]; + int64_t expect[8]; +} test_mm512_broadcast_i64x4_data_model; +static test_mm512_broadcast_i64x4_data_model g_test_mm512_broadcast_i64x4_data = { + {2016LL, 2017LL, 2018LL, 2019LL}, {2016LL, 2017LL, 2018LL, 2019LL, 2016LL, 2017LL, 2018LL, 2019LL}}; + +typedef struct { + int64_t src[8]; + unsigned char k; + int64_t a[4]; + int64_t expect[8]; +} test_mm512_mask_broadcast_i64x4_data_model; +static test_mm512_mask_broadcast_i64x4_data_model g_test_mm512_mask_broadcast_i64x4_data = { + {2000LL, 2004LL, 2008LL, 2012LL, 2016LL, 2020LL, 2024LL, 2028LL}, + 0xAA, + {2018LL, 2019LL, 2018LL, 2019LL}, + {2000LL, 2019LL, 2008LL, 2019LL, 2016LL, 2019LL, 2024LL, 2019LL}, +}; + +typedef struct { + float64_t a[2]; + float64_t expect[4]; +} test_mm256_castpd128_pd256_data_model; +static test_mm256_castpd128_pd256_data_model g_test_mm256_castpd128_pd256_data = { + {2.019, 1.119}, + {2.019, 1.119, 0.0, 0.0}, +}; + +typedef struct { + float64_t a[4]; + float64_t expect[2]; +} test_mm256_castpd256_pd128_data_model; +static test_mm256_castpd256_pd128_data_model 
g_test_mm256_castpd256_pd128_data = { + {2.019, 1.1, 1.9, 1.2}, + {2.019, 1.1}, +}; + +typedef struct { + float32_t a[4]; + float32_t expect[8]; +} test_mm256_castps128_ps256_data_model; +static test_mm256_castps128_ps256_data_model g_test_mm256_castps128_ps256_data = { + {2.019f, 1.1f, 1.9f, 1.2f}, + {2.019f, 1.1f, 1.9f, 1.2f, 0.0f, 0.0f, 0.0f, 0.0f}, +}; + +typedef struct { + float32_t a[8]; + float32_t expect[4]; +} test_mm256_castps256_ps128_data_model; +static test_mm256_castps256_ps128_data_model g_test_mm256_castps256_ps128_data = { + {2.0f, 1.9f, 1.0f, 1.0f, 1.0f, 9.0f, 1.2f, 0.8f}, + {2.0f, 1.9f, 1.0f, 1.0f}, +}; + +typedef struct { + int32_t a[4]; + int32_t expect[8]; +} test_mm256_castsi128_si256_data_model; +static test_mm256_castsi128_si256_data_model g_test_mm256_castsi128_si256_data = { + {2019, 11, 19, 12}, + {2019, 11, 19, 12, 0, 0, 0, 0}, +}; + +typedef struct { + int32_t a[8]; + float32_t expect[8]; +} test_mm256_castsi256_ps_data_model; +static test_mm256_castsi256_ps_data_model g_test_mm256_castsi256_ps_data = { + {1065353216, 1073741824, 1077936128, 1082130432, 1084227584, 1086324736, 1088421888, 1090519040}, + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, +}; + +typedef struct { + int32_t a[8]; + int32_t expect[4]; +} test_mm256_castsi256_si128_data_model; +static test_mm256_castsi256_si128_data_model g_test_mm256_castsi256_si128_data = { + {1, 2, 3, 4, 5, 6, 7, 8}, + {1, 2, 3, 4}, +}; + +typedef struct { + int32_t a[4]; + float64_t expect[4]; +} test_mm256_cvtepi32_pd_data_model; +static test_mm256_cvtepi32_pd_data_model g_test_mm256_cvtepi32_pd_data = { + {2019, 11, 19, 12}, + {2019.0, 11.0, 19.0, 12.0}, +}; + +typedef struct { + int32_t a[8]; + float32_t expect[8]; +} test_mm256_cvtepi32_ps_data_model; +static test_mm256_cvtepi32_ps_data_model g_test_mm256_cvtepi32_ps_data = { + {1, 2, 3, 4, 5, 6, 7, 8}, + {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}, +}; + +typedef struct { + int8_t a[32]; + int8_t b[32]; + const int count; + int8_t expect[32]; +} test_mm256_alignr_epi8_data_model; +static test_mm256_alignr_epi8_data_model g_test_mm256_alignr_epi8_data = { + {-84, -84, -7, -9, -38, -10, -9, -102, 27, -85, 53, 59, -95, -7, 51, -18, + -6, 66, -48, 68, 108, 16, 104, 62, 83, 17, 73, 15, 34, -93, 37, -76}, + {94, 99, 54, 80, 50, 101, 45, -38, -87, -115, 96, -122, -43, -71, 28, 95, + 26, 116, -70, 43, -69, -28, 42, 4, 105, 8, -52, 106, -120, -43, 18, -97}, + 16, + {-84, -84, -7, -9, -38, -10, -9, -102, 27, -85, 53, 59, -95, -7, 51, -18, + -6, 66, -48, 68, 108, 16, 104, 62, 83, 17, 73, 15, 34, -93, 37, -76}}; + +typedef struct { + int16_t a[8]; + int16_t b[8]; + int la; + int lb; + const int imm8; + int expect; +} test_mm_cmpestri_data_model; +static test_mm_cmpestri_data_model g_test_mm_cmpestri_data_model_data = {{13, 6, 5, 4, 3, 2, 1, 3}, + {-7, 16, 5, 4, -1, 6, 1, 3}, + 10, + 10, + _SIDD_SWORD_OPS | _SIDD_CMP_RANGES | + _SIDD_MOST_SIGNIFICANT, + 7}; + +typedef struct { + int16_t a[8]; + int16_t b[8]; + int la; + int lb; + const int imm8; + int16_t expect[8]; +} test_mm_cmpestrm_data_model; +static test_mm_cmpestrm_data_model g_test_mm_cmpestrm_data_model_data = {{13, 6, 5, 4, 3, 2, 1, 3}, + {-7, 16, 5, 4, -1, 6, 1, 3}, + 10, + 10, + _SIDD_SWORD_OPS | _SIDD_CMP_RANGES | + _SIDD_MOST_SIGNIFICANT, + {0, 0, 0, 0, 0, 0, -1, -1}}; + +typedef struct { + int32_t a[4]; + int i; + const int imm8; + int32_t expect[8]; +} test_mm_insert_epi32_data_model; +static test_mm_insert_epi32_data_model g_test_mm_insert_epi32_data = { + {2019, 12, 17, 12}, 1314, 3, {2019, 12, 17, 
1314}}; + +typedef struct { + int32_t a[8]; + int32_t i; + const int index; + int32_t expect[8]; +} test_mm256_insert_epi32_data_model; +static test_mm256_insert_epi32_data_model g_test_mm256_insert_epi32_data = { + {20, 19, 0, 12, 0, 17, 11, 28}, 12, 6, {20, 19, 0, 12, 0, 17, 12, 28}}; + +typedef struct { + int64_t a[4]; + int64_t i; + const int index; + int64_t expect[4]; +} test_mm256_insert_epi64_data_model; +static test_mm256_insert_epi64_data_model g_test_mm256_insert_epi64_data = { + {2019, 12, 17, 12}, 1314, 3, {2019, 12, 17, 1314}}; + +typedef struct { + double a[2]; + double expect[8]; +} test_mm512_castpd128_pd512_data_model; +static test_mm512_castpd128_pd512_data_model g_test_mm512_castpd128_pd512_data = { + {2.019, 1.217}, {2.019, 1.217, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}}; + +typedef struct { + double a[8]; + double expect[2]; +} test_mm512_castpd512_pd128_data_model; +static test_mm512_castpd512_pd128_data_model g_test_mm512_castpd512_pd128_data = { + {2.019, 1.217, 3.14, 3.141, 3.1415, 3.14159, 3.141592, 3.1415926}, {2.019, 1.217}}; + +typedef struct { + float a[4]; + float expect[16]; +} test_mm512_castps128_ps512_data_model; +static test_mm512_castps128_ps512_data_model g_test_mm512_castps128_ps512_data = { + {2.0f, 1.9f, 1.2f, 1.7f}, + {2.0f, 1.9f, 1.2f, 1.7f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f}}; + +typedef struct { + float a[16]; + float expect[4]; +} test_mm512_castps512_ps128_data_model; +static test_mm512_castps512_ps128_data_model g_test_mm512_castps512_ps128_data = { + {2.0f, 1.9f, 1.2f, 1.7f, 3.1f, 3.2f, 3.3f, 3.4f, 3.5f, 3.6f, 3.7f, 3.8f, 3.9f, 4.0f, 4.1f, 4.2f}, + {2.0f, 1.9f, 1.2f, 1.7f}}; + +typedef struct { + int32_t a[8]; + double expect[8]; +} test_mm512_cvtepi32_pd_data_model; +static test_mm512_cvtepi32_pd_data_model g_test_mm512_cvtepi32_pd_data = { + {20, 19, 0, 12, 0, 17, 11, 28}, {20.0, 19.0, 0.0, 12.0, 0.0, 17.0, 11.0, 28.0}}; + +typedef struct { + int32_t a[16]; + float expect[16]; +} test_mm512_cvtepi32_ps_data_model; +static test_mm512_cvtepi32_ps_data_model g_test_mm512_cvtepi32_ps_data = { + {20, 19, 0, 12, 0, 17, 11, 28, 107, 40, 33, 39, 118, 21, 29, 11}, + {20.0f, 19.0f, 0.0f, 12.0f, 0.0f, 17.0f, 11.0f, 28.0f, 107.0f, 40.0f, 33.0f, 39.0f, 118.0f, 21.0f, 29.0f, 11.0f}}; + +typedef struct { + float a[16]; + float b[8]; + int imm8; + float expect[16]; +} test_mm512_insertf32x8_data_model; +static test_mm512_insertf32x8_data_model g_test_mm512_insertf32x8_data = { + {2.0f, 1.9f, 1.2f, 1.7f, 3.1f, 3.2f, 3.3f, 3.4f, 3.5f, 3.6f, 3.7f, 3.8f, 3.9f, 4.0f, 4.1f, 4.2f}, + {2.0f, 1.9f, 0.0f, 1.2f, 0.0f, 1.7f, 1.1f, 2.8f}, + 1, + {2.0f, 1.9f, 1.2f, 1.7f, 3.1f, 3.2f, 3.3f, 3.4f, 2.0f, 1.9f, 0.0f, 1.2f, 0.0f, 1.7f, 1.1f, 2.8f}}; + +typedef struct { + double a[8]; + double b[4]; + int imm8; + double expect[8]; +} test_mm512_insertf64x4_data_model; +static test_mm512_insertf64x4_data_model g_test_mm512_insertf64x4_data = { + {2.019, 1.217, 3.14, 3.141, 3.1415, 3.14159, 3.141592, 3.1415926}, + {2.0, 1.9, 1.2, 1.7}, + 0, + {2.0, 1.9, 1.2, 1.7, 3.1415, 3.14159, 3.141592, 3.1415926}}; + +typedef struct { + int32_t a[16]; + int32_t b[8]; + int imm8; + int32_t expect[16]; +} test_mm512_inserti32x8_data_model; +static test_mm512_inserti32x8_data_model g_test_mm512_inserti32x8_data = { + {20, 19, 0, 12, 0, 17, 11, 28, 107, 40, 33, 39, 118, 21, 29, 11}, + {1, 2, 3, 4, 5, 6, 7, 8}, + 1, + {20, 19, 0, 12, 0, 17, 11, 28, 1, 2, 3, 4, 5, 6, 7, 8}}; + +typedef struct { + int64_t a[8]; + int64_t b[4]; + int imm8; + int64_t expect[8]; +} 
test_mm512_inserti64x4_data_model; +static test_mm512_inserti64x4_data_model g_test_mm512_inserti64x4_data = { + {20, 19, 0, 12, 0, 17, 11, 28}, {1, 2, 3, 4}, 1, {20, 19, 0, 12, 1, 2, 3, 4}}; + +typedef struct { + int32_t idx[16]; + int32_t a[16]; + int32_t expect[16]; +} test_mm512_permutexvar_epi32_data_model; +static test_mm512_permutexvar_epi32_data_model g_test_mm512_permutexvar_epi32_data = { + {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}, + {1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000}, + {2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 1000}, +}; + +typedef struct { + float32_t a[16]; + float32_t b[16]; + __mmask16 expect[32]; +} test_mm512_cmp_ps_mask_data_model; +static test_mm512_cmp_ps_mask_data_model g_test_mm512_cmp_ps_mask_data1 = { + {-3.200000, NAN, 89.769997, 65.000000, 3336.399902, -98.000000, 1002.000000, -88.653999, NAN, 0.000000, \ + -12.500000, 6678.346191, 453.345001, NAN, -477.345001, 2134.333008,}, + {-78.000000, NAN, -6.200000, 2.000000, 41.200001, NAN, 95.300003, -88.653999, NAN, 88.690002, -100.639999, NAN, \ + 6678.346191, 856.435730, 6678.346191, 41124.238281}, + {128, 53760, 53888, 10530, 65407, 11775, 11647, 55005, 10658, 64290, 64418, 0, 54877, 1245, 1117, 65535, 128, \ + 53760, 53888, 10530, 65407, 11775, 11647, 55005, 10658, 64290, 64418, 0, 54877, 1245, 1117, 65535} +}; +static test_mm512_cmp_ps_mask_data_model g_test_mm512_cmp_ps_mask_data2 = { + {-3.200000, 99.378502, 89.769997, 65.000000, 3336.399902, -98.000000, 1002.000000, -88.653999, 77.690002, \ + 0.000000, -12.500000, 6678.346191, 453.345001, 54646.460938, -477.345001, 2134.333008}, + {-78.000000, 15.600000, -6.200000, 2.000000, 41.200001, 14.000000, 95.300003, -88.653999, -7.690000, 88.690002, \ + -100.639999, -13.778000, 6678.346191, 856.435730, 6678.346191, 41124.238281}, + {128, 53792, 53920, 0, 65407, 11743, 11615, 65535, 128, 53792, 53920, 0, 65407, 11743, 11615, 65535, 128, 53792, \ + 53920, 0, 65407, 11743, 11615, 65535, 128, 53792, 53920, 0, 65407, 11743, 11615, 65535} +}; + +typedef struct { + float64_t a[8]; + float64_t b[8]; + __mmask8 expect[32]; +} test_mm512_cmp_pd_mask_data_model; +static test_mm512_cmp_pd_mask_data_model g_test_mm512_cmp_pd_mask_data1 = { + {-3.200000, 99.378500, 89.770000, 65.000000, NAN, -88.654000, NAN, 0.000000}, + {NAN, 15.600000, -6.200000, 2.000000, 41.200000, 14.000000, NAN, -88.654000}, + {0, 32, 32, 81, 255, 223, 223, 174, 81, 113, 113, 0, 174, 142, 142, 255, 0, 32, 32, 81, 255, 223, 223, 174, 81, \ + 113, 113, 0, 174, 142, 142, 255} +}; +static test_mm512_cmp_pd_mask_data_model g_test_mm512_cmp_pd_mask_data2 = { + {-3.200000, 99.378500, 89.770000, 65.000000, 1002.000000, -88.654000, 77.690000, 0.000000}, + {-78.000000, 15.600000, -6.200000, 2.000000, 41.200000, 14.000000, 95.300000, -88.654000}, + {0, 96, 96, 0, 255, 159, 159, 255, 0, 96, 96, 0, 255, 159, 159, 255, 0, 96, 96, 0, 255, 159, 159, 255, 0, 96, 96, \ + 0, 255, 159, 159, 255} +}; + +static __m256d test_mm256_cmp_pd_data_model_unordered_data1 = {NAN, 6678.346, 453.345635, NAN}; +static __m256d test_mm256_cmp_pd_data_model_unordered_data2 = {NAN, 6678.346, NAN, 856.43576}; +static long long test_mm256_cmp_pd_data_model_unordered_ret[32][4] = { + {0, -1, 0, 0}, {0, 0, 0, 0}, {0, -1, 0, 0}, {-1, 0, -1, -1}, {-1, 0, -1, -1}, {-1, -1, -1, -1}, + {-1, 0, -1, -1}, {0, -1, 0, 0}, {-1, -1, -1, -1}, {-1, 0, -1, -1}, {-1, -1, -1, -1}, {0, 0, 0, 0}, + {0, 0, 0, 0}, {0, -1, 0, 0}, {0, 0, 
0, 0}, {-1, -1, -1, -1}, {0, -1, 0, 0}, {0, 0, 0, 0}, + {0, -1, 0, 0}, {-1, 0, -1, -1}, {-1, 0, -1, -1}, {-1, -1, -1, -1}, {-1, 0, -1, -1}, {0, -1, 0, 0}, + {-1, -1, -1, -1}, {-1, 0, -1, -1}, {-1, -1, -1, -1}, {0, 0, 0, 0}, {0, 0, 0, 0}, {0, -1, 0, 0}, + {0, 0, 0, 0}, {-1, -1, -1, -1}}; +static __m256d test_mm256_cmp_pd_data_model_ordered_data1 = {2134.3343, 6678.346, 453.345635, 54646.464356}; +static __m256d test_mm256_cmp_pd_data_model_ordered_data2 = {41124.234, 6678.346, 8653.65635, 856.43576}; +static long long test_mm256_cmp_pd_data_model_ordered_ret[32][4] = { + {0, -1, 0, 0}, {-1, 0, -1, 0}, {-1, -1, -1, 0}, {0, 0, 0, 0}, {-1, 0, -1, -1}, {0, -1, 0, -1}, + {0, 0, 0, -1}, {-1, -1, -1, -1}, {0, -1, 0, 0}, {-1, 0, -1, 0}, {-1, -1, -1, 0}, {0, 0, 0, 0}, + {-1, 0, -1, -1}, {0, -1, 0, -1}, {0, 0, 0, -1}, {-1, -1, -1, -1}, {0, -1, 0, 0}, {-1, 0, -1, 0}, + {-1, -1, -1, 0}, {0, 0, 0, 0}, {-1, 0, -1, -1}, {0, -1, 0, -1}, {0, 0, 0, -1}, {-1, -1, -1, -1}, + {0, -1, 0, 0}, {-1, 0, -1, 0}, {-1, -1, -1, 0}, {0, 0, 0, 0}, {-1, 0, -1, -1}, {0, -1, 0, -1}, + {0, 0, 0, -1}, {-1, -1, -1, -1}}; + +static __m256 test_mm256_cmp_ps_data_model_unordered_data1 = {77.690002, NAN, 0.000000, -12.500000, NAN, 6678.346191, \ + 453.345001, NAN}; +static __m256 test_mm256_cmp_ps_data_model_unordered_data2 = {-7.690000, 88.690002, -100.639999, -13.778000, NAN, \ + 6678.346191, NAN, 856.778015}; +static int test_mm256_cmp_ps_data_model_unordered_ret[32][8] = { + {0, 0, 0, 0, 0, -1, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, -1, 0, 0}, + {0, -1, 0, 0, -1, 0, -1, -1}, {-1, -1, -1, -1, -1, 0, -1, -1}, {-1, -1, -1, -1, -1, -1, -1, -1}, + {-1, -1, -1, -1, -1, 0, -1, -1}, {-1, 0, -1, -1, 0, -1, 0, 0}, {0, -1, 0, 0, -1, -1, -1, -1}, + {0, -1, 0, 0, -1, 0, -1, -1}, {0, -1, 0, 0, -1, -1, -1, -1}, {0, 0, 0, 0, 0, 0, 0, 0}, + {-1, 0, -1, -1, 0, 0, 0, 0}, {-1, 0, -1, -1, 0, -1, 0, 0}, {-1, 0, -1, -1, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1}, {0, 0, 0, 0, 0, -1, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, -1, 0, 0}, {0, -1, 0, 0, -1, 0, -1, -1}, {-1, -1, -1, -1, -1, 0, -1, -1}, + {-1, -1, -1, -1, -1, -1, -1, -1}, {-1, -1, -1, -1, -1, 0, -1, -1}, {-1, 0, -1, -1, 0, -1, 0, 0}, + {0, -1, 0, 0, -1, -1, -1, -1}, {0, -1, 0, 0, -1, 0, -1, -1}, {0, -1, 0, 0, -1, -1, -1, -1}, + {0, 0, 0, 0, 0, 0, 0, 0}, {-1, 0, -1, -1, 0, 0, 0, 0}, {-1, 0, -1, -1, 0, -1, 0, 0}, + {-1, 0, -1, -1, 0, 0, 0, 0}, {-1, -1, -1, -1, -1, -1, -1, -1}}; +static __m256 test_mm256_cmp_ps_data_model_ordered_data1 = {77.690002, 0.000000, -12.500000, 6678.346191, 453.345001, \ + 54646.460938, -477.345001, 2134.333008}; +static __m256 test_mm256_cmp_ps_data_model_ordered_data2 = {-7.690000, 88.690002, -100.639999, -13.778000, \ + 6678.346191, 856.435730, 6678.346191, 41124.238281}; +static int test_mm256_cmp_ps_data_model_ordered_ret[32][8] = { + {0, 0, 0, 0, 0, 0, 0, 0}, {0, -1, 0, 0, -1, 0, -1, -1}, {0, -1, 0, 0, -1, 0, -1, -1}, + {0, 0, 0, 0, 0, 0, 0, 0}, {-1, -1, -1, -1, -1, -1, -1, -1}, {-1, 0, -1, -1, 0, -1, 0, 0}, + {-1, 0, -1, -1, 0, -1, 0, 0}, {-1, -1, -1, -1, -1, -1, -1, -1}, {0, 0, 0, 0, 0, 0, 0, 0}, + {0, -1, 0, 0, -1, 0, -1, -1}, {0, -1, 0, 0, -1, 0, -1, -1}, {0, 0, 0, 0, 0, 0, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1}, {-1, 0, -1, -1, 0, -1, 0, 0}, {-1, 0, -1, -1, 0, -1, 0, 0}, + {-1, -1, -1, -1, -1, -1, -1, -1}, {0, 0, 0, 0, 0, 0, 0, 0}, {0, -1, 0, 0, -1, 0, -1, -1}, + {0, -1, 0, 0, -1, 0, -1, -1}, {0, 0, 0, 0, 0, 0, 0, 0}, {-1, -1, -1, -1, -1, -1, -1, -1}, + {-1, 0, -1, -1, 0, -1, 0, 0}, {-1, 0, -1, -1, 0, -1, 0, 0}, {-1, -1, -1, -1, 
-1, -1, -1, -1}, + {0, 0, 0, 0, 0, 0, 0, 0}, {0, -1, 0, 0, -1, 0, -1, -1}, {0, -1, 0, 0, -1, 0, -1, -1}, + {0, 0, 0, 0, 0, 0, 0, 0}, {-1, -1, -1, -1, -1, -1, -1, -1}, {-1, 0, -1, -1, 0, -1, 0, 0}, + {-1, 0, -1, -1, 0, -1, 0, 0}, {-1, -1, -1, -1, -1, -1, -1, -1}}; +#endif diff --git a/emmintrin.h b/emmintrin.h new file mode 100644 index 0000000..1fca37c --- /dev/null +++ b/emmintrin.h @@ -0,0 +1,835 @@ +/* + * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved. + + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + + * http://www.apache.org/licenses/LICENSE-2.0 + + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + + */ + +#ifndef AVX2NEON_H +#error Never use <emmintrin.h> directly; include "avx2neon.h" instead. +#endif + + +#include <arm_neon.h> + +#include <math.h> +#ifdef __cplusplus +using namespace std; +#endif + +#include "typedefs.h" + +typedef union { + int8x16_t vect_s8; + int16x8_t vect_s16; + int32x4_t vect_s32; + int64x2_t vect_s64; + uint8x16_t vect_u8; + uint16x8_t vect_u16; + uint32x4_t vect_u32; + uint64x2_t vect_u64; +} __m128i; + +typedef float32x4_t __m128; + +typedef float64x2_t __m128d; + +typedef enum { + _MM_CMPINT_EQ = 0,    /* Equal */ + _MM_CMPINT_LT = 1,    /* Less than */ + _MM_CMPINT_LE = 2,    /* Less than or Equal */ + _MM_CMPINT_FALSE = 3, /* Always False */ + _MM_CMPINT_NE = 4,    /* Not Equal */ + _MM_CMPINT_NLT = 5,   /* Not Less than */ + _MM_CMPINT_NLE = 6,   /* Not Less than or Equal */ + _MM_CMPINT_TRUE = 7   /* Always True */ +} _MM_CMPINT_ENUM; + +static uint64_t g_mask_epi64[2] __attribute__((aligned(16))) = {0x01, 0x02}; +static uint32_t g_mask_epi32[4] __attribute__((aligned(16))) = {0x01, 0x02, 0x04, 0x08}; +static uint16_t g_mask_epi16[8] __attribute__((aligned(16))) = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80}; +static uint8_t g_mask_epi8[16] __attribute__((aligned(16))) = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80}; + +#define _SIDD_UBYTE_OPS 0x00 // unsigned 8-bit characters +#define _SIDD_UWORD_OPS 0x01 // unsigned 16-bit characters +#define _SIDD_SBYTE_OPS 0x02 // signed 8-bit characters +#define _SIDD_SWORD_OPS 0x03 // signed 16-bit characters + +#define _SIDD_CMP_EQUAL_ANY 0x00 // compare equal any +#define _SIDD_CMP_RANGES 0x04 // compare ranges +#define _SIDD_CMP_EQUAL_EACH 0x08 // compare equal each +#define _SIDD_CMP_EQUAL_ORDERED 0x0C // compare equal ordered + +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_NEGATIVE_POLARITY 0x10 // negate results +#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 // negate results only before end of string + +#define _SIDD_LEAST_SIGNIFICANT 0x00 // index only: return least significant bit +#define _SIDD_MOST_SIGNIFICANT 0x40 // index only: return most significant bit + +#define _SIDD_BIT_MASK 0x00 // mask only: return bit mask +#define _SIDD_UNIT_MASK 0x40 // mask only: return byte/word mask + +#define PCMPSTR_EQ_16x8(a, b, mtx) \ + { \ + mtx[0].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 0)), a.vect_u16); \ + mtx[1].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 1)), a.vect_u16); \ + mtx[2].vect_u16 = 
vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 2)), a.vect_u16); \ + mtx[3].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 3)), a.vect_u16); \ + mtx[4].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 4)), a.vect_u16); \ + mtx[5].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 5)), a.vect_u16); \ + mtx[6].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 6)), a.vect_u16); \ + mtx[7].vect_u16 = vceqq_u16(vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 7)), a.vect_u16); \ + } + +#define PCMPSTR_EQ_8x16(a, b, mtx) \ + { \ + mtx[0].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 0)), a.vect_u8); \ + mtx[1].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 1)), a.vect_u8); \ + mtx[2].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 2)), a.vect_u8); \ + mtx[3].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 3)), a.vect_u8); \ + mtx[4].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 4)), a.vect_u8); \ + mtx[5].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 5)), a.vect_u8); \ + mtx[6].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 6)), a.vect_u8); \ + mtx[7].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 7)), a.vect_u8); \ + mtx[8].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 8)), a.vect_u8); \ + mtx[9].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 9)), a.vect_u8); \ + mtx[10].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 10)), a.vect_u8); \ + mtx[11].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 11)), a.vect_u8); \ + mtx[12].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 12)), a.vect_u8); \ + mtx[13].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 13)), a.vect_u8); \ + mtx[14].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 14)), a.vect_u8); \ + mtx[15].vect_u8 = vceqq_u8(vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 15)), a.vect_u8); \ + } + +#define PCMPSTR_RNG_U16x8(a, b, mtx) \ + { \ + uint16x8_t vect_b[8]; \ + __m128i mask; \ + mask.vect_u32 = vdupq_n_u32(0xffff); \ + vect_b[0] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 0)); \ + vect_b[1] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 1)); \ + vect_b[2] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 2)); \ + vect_b[3] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 3)); \ + vect_b[4] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 4)); \ + vect_b[5] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 5)); \ + vect_b[6] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 6)); \ + vect_b[7] = vdupq_n_u16(vgetq_lane_u16(b.vect_u16, 7)); \ + int i; \ + for (i = 0; i < 8; i++) { \ + mtx[i].vect_u16 = vbslq_u16(mask.vect_u16, vcgeq_u16(vect_b[i], a.vect_u16), \ + vcleq_u16(vect_b[i], a.vect_u16)); \ + } \ + } +#define PCMPSTR_RNG_S16x8(a, b, mtx) \ + { \ + int16x8_t vect_b[8]; \ + __m128i mask; \ + mask.vect_u32 = vdupq_n_u32(0xffff); \ + vect_b[0] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 0)); \ + vect_b[1] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 1)); \ + vect_b[2] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 2)); \ + vect_b[3] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 3)); \ + vect_b[4] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 4)); \ + vect_b[5] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 5)); \ + vect_b[6] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 6)); \ + vect_b[7] = vdupq_n_s16(vgetq_lane_s16(b.vect_s16, 7)); \ + int i; \ + for (i = 0; i < 8; i++) { \ + mtx[i].vect_u16 = vbslq_u16(mask.vect_u16, vcgeq_s16(vect_b[i], a.vect_s16), \ + vcleq_s16(vect_b[i], a.vect_s16)); \ + } \ + } + +#define PCMPSTR_RNG_U8x16(a, b, mtx) \ + { \ + uint8x16_t vect_b[16]; \ + 
__m128i mask; \ + mask.vect_u16 = vdupq_n_u16(0xff); \ + vect_b[0] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 0)); \ + vect_b[1] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 1)); \ + vect_b[2] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 2)); \ + vect_b[3] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 3)); \ + vect_b[4] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 4)); \ + vect_b[5] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 5)); \ + vect_b[6] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 6)); \ + vect_b[7] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 7)); \ + vect_b[8] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 8)); \ + vect_b[9] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 9)); \ + vect_b[10] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 10)); \ + vect_b[11] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 11)); \ + vect_b[12] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 12)); \ + vect_b[13] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 13)); \ + vect_b[14] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 14)); \ + vect_b[15] = vdupq_n_u8(vgetq_lane_u8(b.vect_u8, 15)); \ + int i; \ + for (i = 0; i < 16; i++) { \ + mtx[i].vect_u8 = vbslq_u8(mask.vect_u8, vcgeq_u8(vect_b[i], a.vect_u8), vcleq_u8(vect_b[i], a.vect_u8));\ + } \ + } + +#define PCMPSTR_RNG_S8x16(a, b, mtx) \ + { \ + int8x16_t vect_b[16]; \ + __m128i mask; \ + mask.vect_u16 = vdupq_n_u16(0xff); \ + vect_b[0] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 0)); \ + vect_b[1] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 1)); \ + vect_b[2] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 2)); \ + vect_b[3] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 3)); \ + vect_b[4] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 4)); \ + vect_b[5] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 5)); \ + vect_b[6] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 6)); \ + vect_b[7] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 7)); \ + vect_b[8] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 8)); \ + vect_b[9] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 9)); \ + vect_b[10] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 10)); \ + vect_b[11] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 11)); \ + vect_b[12] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 12)); \ + vect_b[13] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 13)); \ + vect_b[14] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 14)); \ + vect_b[15] = vdupq_n_s8(vgetq_lane_s8(b.vect_s8, 15)); \ + int i; \ + for (i = 0; i < 16; i++) { \ + mtx[i].vect_u8 = vbslq_u8(mask.vect_u8, vcgeq_s8(vect_b[i], a.vect_s8), vcleq_s8(vect_b[i], a.vect_s8));\ + } \ + } + +#define SET32x4(res, e0, e1, e2, e3) \ + __asm__ __volatile__ ( \ + "mov %[r].s[0], %w[x] \n\t" \ + "mov %[r].s[1], %w[y] \n\t" \ + "mov %[r].s[2], %w[z] \n\t" \ + "mov %[r].s[3], %w[k] \n\t" \ + :[r]"=w"(res) \ + :[x]"r"(e0), [y]"r"(e1), [z]"r"(e2), [k]"r"(e3) \ + ); + +#define SET64x2(res, e0, e1) \ + __asm__ __volatile__ ( \ + "mov %[r].d[0], %[x] \n\t" \ + "mov %[r].d[1], %[y] \n\t" \ + :[r]"=w"(res) \ + :[x]"r"(e0), [y]"r"(e1) \ + ); + +/* extract highest bit from every 32bit */ +#define PICK_HB_32x16(res, sign) \ + { \ + res.vect_u32[0] = vshrq_n_u32(res.vect_u32[0], 31); \ + res.vect_u32[1] = vshrq_n_u32(res.vect_u32[1], 31); \ + res.vect_u32[2] = vshrq_n_u32(res.vect_u32[2], 31); \ + res.vect_u32[3] = vshrq_n_u32(res.vect_u32[3], 31); \ + res.vect_u64[0] = vsraq_n_u64(res.vect_u64[0], res.vect_u64[0], 31); \ + res.vect_u64[1] = vsraq_n_u64(res.vect_u64[1], res.vect_u64[1], 31); \ + res.vect_u64[2] = vsraq_n_u64(res.vect_u64[2], res.vect_u64[2], 31); \ + res.vect_u64[3] = vsraq_n_u64(res.vect_u64[3], res.vect_u64[3], 31); \ + *sign = (vgetq_lane_u8(res.vect_u8[0], 0) | (vgetq_lane_u8(res.vect_u8[0], 8) << 2) | \ + (vgetq_lane_u8(res.vect_u8[1], 0) << 4) | 
(vgetq_lane_u8(res.vect_u8[1], 8) << 6) | \ + (vgetq_lane_u8(res.vect_u8[2], 0) << 8) | (vgetq_lane_u8(res.vect_u8[2], 8) << 10) | \ + (vgetq_lane_u8(res.vect_u8[3], 0) << 12) | (vgetq_lane_u8(res.vect_u8[3], 8) << 14)); \ + }; + +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ + return (int)vaddlv_u8(vcnt_u8(vcreate_u8((unsigned __int64)a))); +} + +FORCE_INLINE __int64 _mm_popcnt_u64(unsigned __int64 a) +{ + return (__int64)vaddlv_u8(vcnt_u8(vcreate_u8(a))); +} + +FORCE_INLINE __m128i _mm_div_epi8(__m128i a, __m128i b) +{ + __m128i res_m128i; + int16x8_t ta[2], tb[2]; + int16x8_t tmp_lo, tmp_hi; + int32x4_t la[4], lb[4]; + float32x4_t res[4]; + + ta[0] = vmovl_s8(vget_low_s8(a.vect_s8)); + tb[0] = vmovl_s8(vget_low_s8(b.vect_s8)); + ta[1] = vmovl_s8(vget_high_s8(a.vect_s8)); + tb[1] = vmovl_s8(vget_high_s8(b.vect_s8)); + + la[0] = vmovl_s16(vget_low_s16(ta[0])); + lb[0] = vmovl_s16(vget_low_s16(tb[0])); + la[1] = vmovl_s16(vget_high_s16(ta[0])); + lb[1] = vmovl_s16(vget_high_s16(tb[0])); + la[2] = vmovl_s16(vget_low_s16(ta[1])); + lb[2] = vmovl_s16(vget_low_s16(tb[1])); + la[3] = vmovl_s16(vget_high_s16(ta[1])); + lb[3] = vmovl_s16(vget_high_s16(tb[1])); + + res[1] = vdivq_f32(vcvtq_f32_s32(la[1]), vcvtq_f32_s32(lb[1])); + res[2] = vdivq_f32(vcvtq_f32_s32(la[2]), vcvtq_f32_s32(lb[2])); + res[0] = vdivq_f32(vcvtq_f32_s32(la[0]), vcvtq_f32_s32(lb[0])); + res[3] = vdivq_f32(vcvtq_f32_s32(la[3]), vcvtq_f32_s32(lb[3])); + tmp_lo = vcombine_s16(vmovn_s32(vcvtq_s32_f32(res[0])), vmovn_s32(vcvtq_s32_f32(res[1]))); + tmp_hi = vcombine_s16(vmovn_s32(vcvtq_s32_f32(res[2])), vmovn_s32(vcvtq_s32_f32(res[3]))); + res_m128i.vect_s8 = vcombine_s8(vmovn_s16(tmp_lo), vmovn_s16(tmp_hi)); + + return res_m128i; +} + +FORCE_INLINE __m128i _mm_div_epi16(__m128i a, __m128i b) +{ + __m128i res_m128i; + float32x4_t fa[2], fb[2]; + float32x4_t res_lo, res_hi; + fa[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(a.vect_s16))); + fb[0] = vcvtq_f32_s32(vmovl_s16(vget_low_s16(b.vect_s16))); + fa[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(a.vect_s16))); + fb[1] = vcvtq_f32_s32(vmovl_s16(vget_high_s16(b.vect_s16))); + res_lo = vdivq_f32(fa[0], fb[0]); + res_hi = vdivq_f32(fa[1], fb[1]); + res_m128i.vect_s16 = vcombine_s16(vmovn_s32(vcvtq_s32_f32(res_lo)), vmovn_s32(vcvtq_s32_f32(res_hi))); + return res_m128i; +} + +FORCE_INLINE __m128i _mm_div_epu8(__m128i a, __m128i b) +{ + __m128i res_m128i; + uint16x8_t ta[2], tb[2]; + uint16x8_t tmp_lo, tmp_hi; + uint32x4_t la[4], lb[4]; + float32x4_t res[4]; + + ta[0] = vmovl_u8(vget_low_u8(a.vect_u8)); + tb[0] = vmovl_u8(vget_low_u8(b.vect_u8)); + ta[1] = vmovl_u8(vget_high_u8(a.vect_u8)); + tb[1] = vmovl_u8(vget_high_u8(b.vect_u8)); + + la[0] = vmovl_u16(vget_low_u16(ta[0])); + lb[0] = vmovl_u16(vget_low_u16(tb[0])); + la[1] = vmovl_u16(vget_high_u16(ta[0])); + lb[1] = vmovl_u16(vget_high_u16(tb[0])); + la[2] = vmovl_u16(vget_low_u16(ta[1])); + lb[2] = vmovl_u16(vget_low_u16(tb[1])); + la[3] = vmovl_u16(vget_high_u16(ta[1])); + lb[3] = vmovl_u16(vget_high_u16(tb[1])); + + res[1] = vdivq_f32(vcvtq_f32_u32(la[1]), vcvtq_f32_u32(lb[1])); + res[2] = vdivq_f32(vcvtq_f32_u32(la[2]), vcvtq_f32_u32(lb[2])); + res[0] = vdivq_f32(vcvtq_f32_u32(la[0]), vcvtq_f32_u32(lb[0])); + res[3] = vdivq_f32(vcvtq_f32_u32(la[3]), vcvtq_f32_u32(lb[3])); + tmp_lo = vcombine_u16(vmovn_u32(vcvtq_u32_f32(res[0])), vmovn_u32(vcvtq_u32_f32(res[1]))); + tmp_hi = vcombine_u16(vmovn_u32(vcvtq_u32_f32(res[2])), vmovn_u32(vcvtq_u32_f32(res[3]))); + res_m128i.vect_u8 = vcombine_u8(vmovn_u16(tmp_lo), vmovn_u16(tmp_hi)); 
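+    /* Each u8 lane was widened to f32, divided, and narrowed back to u8; the
+       float-to-int conversion truncates, matching the round-toward-zero
+       semantics of integer division. */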
+
+    return res_m128i;
+}
+
+FORCE_INLINE __m128i _mm_div_epu16(__m128i a, __m128i b)
+{
+    __m128i res_m128i;
+    float32x4_t fa[2], fb[2];
+    float32x4_t res_lo, res_hi;
+    fa[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(a.vect_u16)));
+    fb[0] = vcvtq_f32_u32(vmovl_u16(vget_low_u16(b.vect_u16)));
+    fa[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(a.vect_u16)));
+    fb[1] = vcvtq_f32_u32(vmovl_u16(vget_high_u16(b.vect_u16)));
+    res_lo = vdivq_f32(fa[0], fb[0]);
+    res_hi = vdivq_f32(fa[1], fb[1]);
+    res_m128i.vect_u16 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(res_lo)), vmovn_u32(vcvtq_u32_f32(res_hi)));
+    return res_m128i;
+}
+
+FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
+{
+    long long c = count.vect_s64[0];
+    __m128i result_m128i;
+    if (likely(c >= 0 && c < 64)) {
+        /* vshlq_n_s64 only accepts a compile-time-constant shift; the
+           register form handles the runtime count. */
+        result_m128i.vect_s64 = vshlq_s64(a.vect_s64, vdupq_n_s64(c));
+    } else {
+        result_m128i.vect_s64 = vdupq_n_s64(0);
+    }
+    return result_m128i;
+}
+
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+    __m128i result_m128i;
+    result_m128i.vect_u64 = vceqq_s64(a.vect_s64, b.vect_s64);
+    return result_m128i;
+}
+
+FORCE_INLINE __m128 _mm_move_ss (__m128 a, __m128 b)
+{
+    __asm__ __volatile__(
+        "mov %0.s[0], %1.s[0] \n\t"
+        :"+w"(a)
+        :"w"(b)
+    );
+    return a;
+}
+
+FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
+{
+    __asm__ __volatile__(
+        "mov %0.d[0], %1.d[0] \n\t"
+        :"+w"(a)
+        :"w"(b)
+    );
+    return a;
+}
+
+FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
+{
+    __m128i res_m128i;
+    res_m128i.vect_s64 = vandq_s64(a.vect_s64, b.vect_s64);
+    return !(vgetq_lane_s64(res_m128i.vect_s64, 0) | vgetq_lane_s64(res_m128i.vect_s64, 1));
+}
+
+FORCE_INLINE int _mm_extract_epi32(__m128i a, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 3);
+    return a.vect_s32[imm8];
+}
+
+FORCE_INLINE int _mm_extract_ps (__m128 a, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 3);
+    return vreinterpretq_s32_f32(a)[imm8];
+}
+
+FORCE_INLINE __int64 _mm_extract_epi64 (__m128i a, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 1);
+    return a.vect_s64[imm8];
+}
+
+FORCE_INLINE unsigned int _mm_crc32_u8 (unsigned int crc, unsigned char v)
+{
+    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v));
+    return crc;
+}
+
+FORCE_INLINE unsigned int _mm_crc32_u16(unsigned int crc, unsigned short v)
+{
+    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v));
+    return crc;
+}
+
+FORCE_INLINE unsigned int _mm_crc32_u32(unsigned int crc, unsigned int v)
+{
+    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v));
+    return crc;
+}
+
+FORCE_INLINE unsigned __int64 _mm_crc32_u64(unsigned __int64 crc, unsigned __int64 v)
+{
+    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" : [c] "+r"(crc) : [v] "r"(v));
+    return crc;
+}
+
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    __m128d res_m128d;
+    SET64x2(res_m128d, e0, e1);
+    return res_m128d;
+}
+
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t a)
+{
+    __m128i ret;
+    ret.vect_s64 = vdupq_n_s64(a);
+    return ret;
+}
+
+FORCE_INLINE __m128d _mm_set1_pd(double a)
+{
+    return vdupq_n_f64(a);
+}
+
+FORCE_INLINE __m128i _mm_set_epi32(int e3, int e2, int e1, int e0)
+{
+    __m128i res_m128i;
+    SET32x4(res_m128i.vect_s32, e0, e1, e2, e3);
+    return res_m128i;
+}
+
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t e1, int64_t e0)
+{
+    __m128i res_m128i;
+    SET64x2(res_m128i.vect_s64, e0, e1);
+    return res_m128i;
+}
+
+FORCE_INLINE __m128 _mm_set_ps(float e3, float e2, float e1, float e0)
+{
+    __m128 res_m128;
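+    /* SET32x4 writes e0 into lane 0 and e3 into lane 3, so passing the
+       arguments reversed here preserves _mm_set_ps lane ordering. */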
+ SET32x4(res_m128, e0, e1, e2, e3); + return res_m128; +} + +static int aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int j; + int m = (1 << la) - 1; + uint8x8_t vect_mask = vld1_u8(g_mask_epi8); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vect_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vect_mask); + uint8x16_t vect = vcombine_u8(t_lo, t_hi); + for (j = 0; j < lb; j++) { + mtx[j].vect_u8 = vandq_u8(vect, mtx[j].vect_u8); + mtx[j].vect_u8 = vshrq_n_u8(mtx[j].vect_u8, 7); + int tmp = vaddvq_u8(mtx[j].vect_u8) ? 1 : 0; + res |= ( tmp << j); + } + return res; +} + +static int aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int j; + int m = (1 << la) - 1; + uint16x8_t vect = vtstq_u16(vdupq_n_u16(m), vld1q_u16(g_mask_epi16)); + for (j = 0; j < lb; j++) { + mtx[j].vect_u16 = vandq_u16(vect, mtx[j].vect_u16); + mtx[j].vect_u16 = vshrq_n_u16(mtx[j].vect_u16, 15); + int tmp = vaddvq_u16(mtx[j].vect_u16) ? 1 : 0; + res |= (tmp << j); + } + return res; +} + +static int cal_res_byte_equal_any(__m128i a, int la, __m128i b, int lb) +{ + __m128i mtx[16]; + PCMPSTR_EQ_8x16(a, b, mtx); + return aggregate_equal_any_8x16(la, lb, mtx); +} + +static int cal_res_word_equal_any(__m128i a, int la, __m128i b, int lb) +{ + __m128i mtx[16]; + PCMPSTR_EQ_16x8(a, b, mtx); + return aggregate_equal_any_16x8(la, lb, mtx); +} + +static int aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int j; + int m = (1 << la) - 1; + uint16x8_t vect = vtstq_u16(vdupq_n_u16(m), vld1q_u16(g_mask_epi16)); + for (j = 0; j < lb; j++) { + mtx[j].vect_u16 = vandq_u16(vect, mtx[j].vect_u16); + mtx[j].vect_u16 = vshrq_n_u16(mtx[j].vect_u16, 15); + __m128i tmp; + tmp.vect_u32 = vshrq_n_u32(mtx[j].vect_u32, 16); + uint32x4_t vect_res = vandq_u32(mtx[j].vect_u32, tmp.vect_u32); + int t = vaddvq_u32(vect_res) ? 1 : 0; + res |= (t << j); + } + return res; +} + +static int aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int j; + int m = (1 << la) - 1; + uint8x8_t vect_mask = vld1_u8(g_mask_epi8); + uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vect_mask); + uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vect_mask); + uint8x16_t vect = vcombine_u8(t_lo, t_hi); + for (j = 0; j < lb; j++) { + mtx[j].vect_u8 = vandq_u8(vect, mtx[j].vect_u8); + mtx[j].vect_u8 = vshrq_n_u8(mtx[j].vect_u8, 7); + __m128i tmp; + tmp.vect_u16 = vshrq_n_u16(mtx[j].vect_u16, 8); + uint16x8_t vect_res = vandq_u16(mtx[j].vect_u16, tmp.vect_u16); + int t = vaddvq_u16(vect_res) ? 1 : 0; + res |= (t << j); + } + return res; +} + +static int cal_res_ubyte_ranges(__m128i a, int la, __m128i b, int lb) +{ + __m128i mtx[16]; + PCMPSTR_RNG_U8x16(a, b, mtx); + return aggregate_ranges_8x16(la, lb, mtx); +} + +static int cal_res_sbyte_ranges(__m128i a, int la, __m128i b, int lb) +{ + __m128i mtx[16]; + PCMPSTR_RNG_S8x16(a, b, mtx); + return aggregate_ranges_8x16(la, lb, mtx); +} + +static int cal_res_uword_ranges(__m128i a, int la, __m128i b, int lb) +{ + __m128i mtx[16]; + PCMPSTR_RNG_U16x8(a, b, mtx); + return aggregate_ranges_16x8(la, lb, mtx); +} + +static int cal_res_sword_ranges(__m128i a, int la, __m128i b, int lb) +{ + __m128i mtx[16]; + PCMPSTR_RNG_S16x8(a, b, mtx); + return aggregate_ranges_16x8(la, lb, mtx); +} + +static int cal_res_byte_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint8x16_t mtx = vceqq_u8(a.vect_u8, b.vect_u8); + int m0 = (la < lb) ? 
0 : ((1 << la) - (1 << lb)); + int m1 = 0x10000 - (1 << la); + int tb = 0x10000 - (1 << lb); + uint8x8_t vect_mask, vect0_lo, vect0_hi, vect1_lo, vect1_hi; + uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; + vect_mask = vld1_u8(g_mask_epi8); + vect0_lo = vtst_u8(vdup_n_u8(m0), vect_mask); + vect0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vect_mask); + vect1_lo = vtst_u8(vdup_n_u8(m1), vect_mask); + vect1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vect_mask); + tmp_lo = vtst_u8(vdup_n_u8(tb), vect_mask); + tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vect_mask); + + res_lo = vbsl_u8(vect0_lo, vdup_n_u8(0), vget_low_u8(mtx)); + res_hi = vbsl_u8(vect0_hi, vdup_n_u8(0), vget_high_u8(mtx)); + res_lo = vbsl_u8(vect1_lo, tmp_lo, res_lo); + res_hi = vbsl_u8(vect1_hi, tmp_hi, res_hi); + res_lo = vand_u8(res_lo, vect_mask); + res_hi = vand_u8(res_hi, vect_mask); + + int res = vaddv_u8(res_lo) + (vaddv_u8(res_hi) << 8); + return res; +} + +static int cal_res_word_equal_each(__m128i a, int la, __m128i b, int lb) +{ + uint16x8_t mtx = vceqq_u16(a.vect_u16, b.vect_u16); + int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); + int m1 = 0x100 - (1 << la); + int tb = 0x100 - (1 << lb); + uint16x8_t vect_mask = vld1q_u16(g_mask_epi16); + uint16x8_t vect0 = vtstq_u16(vdupq_n_u16(m0), vect_mask); + uint16x8_t vect1 = vtstq_u16(vdupq_n_u16(m1), vect_mask); + uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vect_mask); + mtx = vbslq_u16(vect0, vdupq_n_u16(0), mtx); + mtx = vbslq_u16(vect1, tmp, mtx); + mtx = vandq_u16(mtx, vect_mask); + return vaddvq_u16(mtx); +} + +static int aggregate_equal_ordered_8x16(int bound, int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int j, k; + int m1 = 0x10000 - (1 << la); + uint8x16_t vect_mask = vld1q_u8(g_mask_epi8); + uint8x16_t vect1 = vtstq_u8(vdupq_n_u8(m1), vect_mask); + uint8x16_t vect_minusone = vdupq_n_u8(-1); + uint8x16_t vect_zero = vdupq_n_u8(0); + for (j = 0; j < lb; j++) { + mtx[j].vect_u8 = vbslq_u8(vect1, vect_minusone, mtx[j].vect_u8); + } + for (j = lb; j < bound; j++) { + mtx[j].vect_u8 = vbslq_u8(vect1, vect_minusone, vect_zero); + } + uint8_t enable[16] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}; + for (j = 0; j < bound; j++) { + int val = 1; + uint8x16_t vect_en = vld1q_u8(enable); + for (k = j; k < bound && val == 1; k++) { + int t = vaddvq_u8(vandq_u8(mtx[j].vect_u8, vect_en)); + val = (t == bound - j) ? 1 : 0; + } + res = (val << j) + res; + enable[bound - 1 - j] = 0; + } + return res; +} + +static int aggregate_equal_ordered_16x8(int bound, int la, int lb, __m128i mtx[16]) +{ + int res = 0; + int j, k; + int m1 = 0x100 - (1 << la); + uint16x8_t vect_mask = vld1q_u16(g_mask_epi16); + uint16x8_t vect1 = vtstq_u16(vdupq_n_u16(m1), vect_mask); + uint16x8_t vect_minusone = vdupq_n_u16(-1); + uint16x8_t vect_zero = vdupq_n_u16(0); + for (j = 0; j < lb; j++) { + mtx[j].vect_u16 = vbslq_u16(vect1, vect_minusone, mtx[j].vect_u16); + } + for (j = lb; j < bound; j++) { + mtx[j].vect_u16 = vbslq_u16(vect1, vect_minusone, vect_zero); + } + uint16_t enable[8] = {1, 1, 1, 1, 1, 1, 1, 1}; + for (j = 0; j < bound; j++) { + int val = 1; + uint16x8_t vect_en = vld1q_u16(enable); + for (k = j; k < bound && val == 1; k++) { + int t = vaddvq_u16(vandq_u16(mtx[j].vect_u16, vect_en)); + val = (t == bound - j) ? 
1 : 0;
+        }
+        res = (val << j) + res;
+        enable[bound - 1 - j] = 0;
+    }
+    return res;
+}
+
+static int cal_res_byte_equal_ordered(__m128i a, int la, __m128i b, int lb)
+{
+    __m128i mtx[16];
+    PCMPSTR_EQ_8x16(a, b, mtx);
+    return aggregate_equal_ordered_8x16(16, la, lb, mtx);
+}
+
+static int cal_res_word_equal_ordered(__m128i a, int la, __m128i b, int lb)
+{
+    __m128i mtx[16];
+    PCMPSTR_EQ_16x8(a, b, mtx);
+    return aggregate_equal_ordered_16x8(8, la, lb, mtx);
+}
+
+typedef enum {
+    CMP_UBYTE_EQUAL_ANY,
+    CMP_UWORD_EQUAL_ANY,
+    CMP_SBYTE_EQUAL_ANY,
+    CMP_SWORD_EQUAL_ANY,
+    CMP_UBYTE_RANGES,
+    CMP_UWORD_RANGES,
+    CMP_SBYTE_RANGES,
+    CMP_SWORD_RANGES,
+    CMP_UBYTE_EQUAL_EACH,
+    CMP_UWORD_EQUAL_EACH,
+    CMP_SBYTE_EQUAL_EACH,
+    CMP_SWORD_EQUAL_EACH,
+    CMP_UBYTE_EQUAL_ORDERED,
+    CMP_UWORD_EQUAL_ORDERED,
+    CMP_SBYTE_EQUAL_ORDERED,
+    CMP_SWORD_EQUAL_ORDERED
+} _MM_CMPESTR_ENUM;
+typedef int (*CMPESTR)(__m128i a, int la, __m128i b, int lb);
+typedef struct {
+    _MM_CMPESTR_ENUM cmpintEnum;
+    CMPESTR cmpFun;
+} CmpestrFuncList;
+/* Indexed by (imm8 & 0x0f): bits 0-1 select the element type, bits 2-3 the aggregation. */
+static CmpestrFuncList g_CmpestrFuncList[] = {{CMP_UBYTE_EQUAL_ANY, cal_res_byte_equal_any},
+                                              {CMP_UWORD_EQUAL_ANY, cal_res_word_equal_any},
+                                              {CMP_SBYTE_EQUAL_ANY, cal_res_byte_equal_any},
+                                              {CMP_SWORD_EQUAL_ANY, cal_res_word_equal_any},
+                                              {CMP_UBYTE_RANGES, cal_res_ubyte_ranges},
+                                              {CMP_UWORD_RANGES, cal_res_uword_ranges},
+                                              {CMP_SBYTE_RANGES, cal_res_sbyte_ranges},
+                                              {CMP_SWORD_RANGES, cal_res_sword_ranges},
+                                              {CMP_UBYTE_EQUAL_EACH, cal_res_byte_equal_each},
+                                              {CMP_UWORD_EQUAL_EACH, cal_res_word_equal_each},
+                                              {CMP_SBYTE_EQUAL_EACH, cal_res_byte_equal_each},
+                                              {CMP_SWORD_EQUAL_EACH, cal_res_word_equal_each},
+                                              {CMP_UBYTE_EQUAL_ORDERED, cal_res_byte_equal_ordered},
+                                              {CMP_UWORD_EQUAL_ORDERED, cal_res_word_equal_ordered},
+                                              {CMP_SBYTE_EQUAL_ORDERED, cal_res_byte_equal_ordered},
+                                              {CMP_SWORD_EQUAL_ORDERED, cal_res_word_equal_ordered}};
+
+FORCE_INLINE int neg_fun(int res, int lb, int imm8, int bound)
+{
+    int m;
+    switch (imm8 & 0x30) {
+        case _SIDD_NEGATIVE_POLARITY:
+            res ^= 0xffffffff;
+            break;
+        case _SIDD_MASKED_NEGATIVE_POLARITY:
+            m = (1 << lb) - 1;
+            res ^= m;
+            break;
+        default:
+            break;
+    }
+
+    return res & ((bound == 8) ? 0xFF : 0xFFFF);
+}
+FORCE_INLINE int _mm_cmpestri(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    int bound = (imm8 & 0x01) ? 8 : 16;
+    /* Clamp la and lb to [0, bound]: take the absolute value, then saturate at bound. */
+    __asm__ __volatile__ (
+        "eor w0, %w[a], %w[a], asr 31 \n\t"
+        "sub %w[a], w0, %w[a], asr 31 \n\t"
+        "eor w1, %w[b], %w[b], asr 31 \n\t"
+        "sub %w[b], w1, %w[b], asr 31 \n\t"
+        "cmp %w[a], %w[bd] \n\t"
+        "csel %w[a], %w[bd], %w[a], gt \n\t"
+        "cmp %w[b], %w[bd] \n\t"
+        "csel %w[b], %w[bd], %w[b], gt \n\t"
+        :[a]"+r"(la), [b]"+r"(lb)
+        :[bd]"r"(bound)
+        :"w0", "w1"
+    );
+
+    int r2 = g_CmpestrFuncList[imm8 & 0x0f].cmpFun(a, la, b, lb);
+    r2 = neg_fun(r2, lb, imm8, bound);
+    return (r2 == 0) ? bound : ((imm8 & 0x40) ? (31 - __builtin_clz(r2)) : __builtin_ctz(r2));
+}
+
+FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
+{
+    __m128i dst;
+    int bound = (imm8 & 0x01) ?
8 : 16;
+    /* Clamp la and lb to [0, bound], as in _mm_cmpestri. */
+    __asm__ __volatile__ (
+        "eor w0, %w[a], %w[a], asr 31 \n\t"
+        "sub %w[a], w0, %w[a], asr 31 \n\t"
+        "eor w1, %w[b], %w[b], asr 31 \n\t"
+        "sub %w[b], w1, %w[b], asr 31 \n\t"
+        "cmp %w[a], %w[bd] \n\t"
+        "csel %w[a], %w[bd], %w[a], gt \n\t"
+        "cmp %w[b], %w[bd] \n\t"
+        "csel %w[b], %w[bd], %w[b], gt \n\t"
+        :[a]"+r"(la), [b]"+r"(lb)
+        :[bd]"r"(bound)
+        :"w0", "w1"
+    );
+
+    int r2 = g_CmpestrFuncList[imm8 & 0x0f].cmpFun(a, la, b, lb);
+    r2 = neg_fun(r2, lb, imm8, bound);
+
+    /* _SIDD_UNIT_MASK (imm8 & 0x40): expand each result bit to a full element
+       of ones; otherwise store the raw bit mask in the low lanes. */
+    dst.vect_u8 = vdupq_n_u8(0);
+    if (imm8 & 0x40) {
+        if (bound == 8) {
+            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), vld1q_u16(g_mask_epi16));
+            dst.vect_u16 = vbslq_u16(tmp, vdupq_n_u16(-1), dst.vect_u16);
+        } else {
+            uint8x16_t vect_r2 = vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8));
+            uint8x16_t tmp = vtstq_u8(vect_r2, vld1q_u8(g_mask_epi8));
+            dst.vect_u8 = vbslq_u8(tmp, vdupq_n_u8(-1), dst.vect_u8);
+        }
+    } else {
+        if (bound == 16) {
+            dst.vect_u16 = vsetq_lane_u16(r2 & 0xffff, dst.vect_u16, 0);
+        } else {
+            dst.vect_u8 = vsetq_lane_u8(r2 & 0xff, dst.vect_u8, 0);
+        }
+    }
+
+    return dst;
+}
+
+FORCE_INLINE __m128i _mm_insert_epi32 (__m128i a, int i, const int imm8)
+{
+    assert(imm8 >= 0 && imm8 <= 3);
+    a.vect_s32 = vsetq_lane_s32(i, a.vect_s32, imm8);
+    return a;
+}
diff --git a/tests/Makefile b/tests/Makefile
new file mode 100644
index 0000000..c2f2404
--- /dev/null
+++ b/tests/Makefile
@@ -0,0 +1,31 @@
+# The default target platform is aarch64: just run "make".
+# On an x86 host, run "make ARCH=x86" to cross-compile with aarch64-linux-gnu-g++.
+ARCH = aarch64
+CXX = g++
+ARCH_CFLAGS = -march=armv8-a+fp+simd+crc
+
+ifeq ($(ARCH), x86)
+    CXX = aarch64-linux-gnu-g++
+endif
+
+
+ABSPATH = $(abspath $(lastword $(MAKEFILE_LIST)))
+ABSDIR = $(dir $(ABSPATH))
+INCLUDE_DIR = $(ABSDIR)../
+DATAFILE_DIR = $(ABSDIR)../data
+
+CXXFLAGS = -Wall -Wcast-qual -DNDEBUG -I $(INCLUDE_DIR) -I $(DATAFILE_DIR) $(ARCH_CFLAGS) -O2
+LDFLAGS = -lm
+
+SRCS = $(wildcard $(ABSDIR)*.c)
+EXEC = $(ABSDIR)test
+
+# Libraries must follow the objects on the link line.
+$(EXEC): $(SRCS)
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS)
+
+
+
+.PHONY: clean
+clean:
+	$(RM) $(EXEC)
+
diff --git a/tests/a2ntest.c b/tests/a2ntest.c
new file mode 100644
index 0000000..e3dafb6
--- /dev/null
+++ b/tests/a2ntest.c
@@ -0,0 +1,4838 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
+ * Description: avx2neon unit test
+ * Author: xuqimeng
+ * Create: 2019-11-05
+ */
+
+#include "a2ntest.h"
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "avx2neontestdata.h"
+
+const int g_256bit_divto_128bit = sizeof(__m256i) / sizeof(__m128i);
+const int g_512bit_divto_128bit = sizeof(__m512i) / sizeof(__m128i);
+
+const unsigned int M256I_M128I_NUM = 2U;
+const unsigned int M256_M128_NUM = 2U;
+const unsigned int M256D_M128D_NUM = 2U;
+
+const unsigned int M512I_M128I_NUM = 4U;
+const unsigned int M512_M128_NUM = 4U;
+const unsigned int M512D_M128D_NUM = 4U;
+
+const unsigned int M128I_INT8_NUM = 16U;
+const unsigned int M128I_INT16_NUM = 8U;
+const unsigned int M128I_INT32_NUM = 4U;
+const unsigned int M128I_INT64_NUM = 2U;
+
+const unsigned int M128_FLOAT32_NUM = 4U;
+const unsigned int M128D_FLOAT64_NUM = 2U;
+
+#define ASSERT_RETURN(x) \
+    if (!(x))            \
+        return FALSE;
+
+#define MM256_CMP_PD(imm, rel, e)                              \
+    do {                                                       \
+        for (i = 0; i < 4; i++)                                \
+            e[i] = ((rel) != 0) ?
-1 : 0; \ + source1 = s1; \ + source2 = s2; \ + dest = _mm256_cmp_pd(source1, source2, imm); \ + if (!comp_return(&dest, &e, 4 * sizeof(long long))) { \ + return FALSE; \ + } \ + } while (0) + +#define MM256_CMP_PS(imm, rel, e) \ + do { \ + for (i = 0; i < 8; i++) \ + e[i] = ((rel) != 0) ? -1 : 0; \ + source1 = s1; \ + source2 = s2; \ + dest = _mm256_cmp_ps(source1, source2, imm); \ + if (!comp_return(&dest, &e, 8 * sizeof(int))) { \ + return FALSE; \ + } \ + } while (0) + +int comp_return(const void *src, const void *dst, const unsigned long len) +{ + return (0 == memcmp(src, dst, len) ? TRUE : FALSE); +} + +const char *RunTest(InstructionTest test, int *flag) +{ + const char *ret = "UNKNOWN!"; + switch (test) { + case UT_MM_POPCNT_U32: + ret = "MM_POPCNT_U32"; + *flag = test_mm_popcnt_u32(); + break; + case UT_MM_POPCNT_U64: + ret = "MM_POPCNT_U64"; + *flag = test_mm_popcnt_u64(); + break; + case UT_MM256_DIV_EPI16: + ret = "MM256_DIV_EPI16"; + *flag = test_mm256_div_epi16(); + break; + case UT_MM256_DIV_EPI32: + ret = "MM256_DIV_EPI32"; + *flag = test_mm256_div_epi32(); + break; + case UT_MM256_DIV_EPI64: + ret = "MM256_DIV_EPI64"; + *flag = test_mm256_div_epi64(); + break; + case UT_MM256_DIV_EPI8: + ret = "MM256_DIV_EPI8"; + *flag = test_mm256_div_epi8(); + break; + case UT_MM256_DIV_EPU16: + ret = "MM256_DIV_EPU16"; + *flag = test_mm256_div_epu16(); + break; + case UT_MM256_DIV_EPU32: + ret = "MM256_DIV_EPU32"; + *flag = test_mm256_div_epu32(); + break; + case UT_MM256_DIV_EPU64: + ret = "MM256_DIV_EPU64"; + *flag = test_mm256_div_epu64(); + break; + case UT_MM256_DIV_EPU8: + ret = "MM256_DIV_EPU8"; + *flag = test_mm256_div_epu8(); + break; + case UT_MM256_DIV_PD: + ret = "MM256_DIV_PD"; + *flag = test_mm256_div_pd(); + break; + case UT_MM256_DIV_PS: + ret = "MM256_DIV_PS"; + *flag = test_mm256_div_ps(); + break; + case UT_MM512_DIV_EPI16: + ret = "MM512_DIV_EPI16"; + *flag = test_mm512_div_epi16(); + break; + case UT_MM512_DIV_EPI32: + ret = "MM512_DIV_EPI32"; + *flag = test_mm512_div_epi32(); + break; + case UT_MM512_DIV_EPI64: + ret = "MM512_DIV_EPI64"; + *flag = test_mm512_div_epi64(); + break; + case UT_MM512_DIV_EPI8: + ret = "MM512_DIV_EPI8"; + *flag = test_mm512_div_epi8(); + break; + case UT_MM512_DIV_EPU16: + ret = "MM512_DIV_EPU16"; + *flag = test_mm512_div_epu16(); + break; + case UT_MM512_DIV_EPU32: + ret = "MM512_DIV_EPU32"; + *flag = test_mm512_div_epu32(); + break; + case UT_MM512_DIV_EPU64: + ret = "MM512_DIV_EPU64"; + *flag = test_mm512_div_epu64(); + break; + case UT_MM512_DIV_EPU8: + ret = "MM512_DIV_EPU8"; + *flag = test_mm512_div_epu8(); + break; + case UT_MM512_DIV_PD: + ret = "MM512_DIV_PD"; + *flag = test_mm512_div_pd(); + break; + case UT_MM512_DIV_PS: + ret = "MM512_DIV_PS"; + *flag = test_mm512_div_ps(); + break; + case UT_MM512_DIV_ROUND_PD: + ret = "MM512_DIV_ROUND_PD"; + *flag = test_mm512_div_round_pd(); + break; + case UT_MM512_DIV_ROUND_PS: + ret = "MM512_DIV_ROUND_PS"; + *flag = test_mm512_div_round_ps(); + break; + case UT_MM256_ADD_EPI8: + ret = "MM256_ADD_EPI8"; + *flag = test_mm256_add_epi8(); + break; + case UT_MM256_ADD_EPI16: + ret = "MM256_ADD_EPI16"; + *flag = test_mm256_add_epi16(); + break; + case UT_MM256_ADD_EPI32: + ret = "MM256_ADD_EPI32"; + *flag = test_mm256_add_epi32(); + break; + case UT_MM256_ADD_EPI64: + ret = "MM256_ADD_EPI64"; + *flag = test_mm256_add_epi64(); + break; + case UT_MM512_ADD_EPI8: + ret = "MM512_ADD_EPI8"; + *flag = test_mm512_add_epi8(); + break; + case UT_MM512_ADD_EPI16: + ret = "MM512_ADD_EPI16"; + *flag = 
test_mm512_add_epi16(); + break; + case UT_MM512_ADD_EPI32: + ret = "MM512_ADD_EPI32"; + *flag = test_mm512_add_epi32(); + break; + case UT_MM512_ADD_EPI64: + ret = "MM512_ADD_EPI64"; + *flag = test_mm512_add_epi64(); + break; + case UT_MM256_ADDS_EPI8: + ret = "MM256_ADDS_EPI8"; + *flag = test_mm256_adds_epi8(); + break; + case UT_MM256_ADDS_EPI16: + ret = "MM256_ADDS_EPI16"; + *flag = test_mm256_adds_epi16(); + break; + case UT_MM256_ADDS_EPU8: + ret = "MM256_ADDS_EPU8"; + *flag = test_mm256_adds_epu8(); + break; + case UT_MM256_ADDS_EPU16: + ret = "MM256_ADDS_EPU16"; + *flag = test_mm256_adds_epu16(); + break; + case UT_MM512_ADDS_EPI8: + ret = "MM512_ADDS_EPI8"; + *flag = test_mm512_adds_epi8(); + break; + case UT_MM512_ADDS_EPI16: + ret = "MM512_ADDS_EPI16"; + *flag = test_mm512_adds_epi16(); + break; + case UT_MM512_ADDS_EPU8: + ret = "MM512_ADDS_EPU8"; + *flag = test_mm512_adds_epu8(); + break; + case UT_MM512_ADDS_EPU16: + ret = "MM512_ADDS_EPU16"; + *flag = test_mm512_adds_epu16(); + break; + case UT_MM256_ADD_PS: + ret = "MM256_ADD_PS"; + *flag = test_mm256_add_ps(); + break; + case UT_MM256_ADD_PD: + ret = "MM256_ADD_PD"; + *flag = test_mm256_add_pd(); + break; + case UT_MM512_ADD_PS: + ret = "MM512_ADD_PS"; + *flag = test_mm512_add_ps(); + break; + case UT_MM512_ADD_PD: + ret = "MM512_ADD_PD"; + *flag = test_mm512_add_pd(); + break; + case UT_MM512_ADD_ROUND_PS: + ret = "MM512_ADD_ROUND_PS"; + *flag = test_mm512_add_round_ps(); + break; + case UT_MM512_ADD_ROUND_PD: + ret = "MM512_ADD_ROUND_PD"; + *flag = test_mm512_add_round_pd(); + break; + case UT_MM512_ADDN_PS: + ret = "MM512_ADDN_PS"; + *flag = test_mm512_addn_ps(); + break; + case UT_MM512_ADDN_PD: + ret = "MM512_ADDN_PD"; + *flag = test_mm512_addn_pd(); + break; + case UT_MM512_ADDN_ROUND_PS: + ret = "MM512_ADDN_ROUND_PS"; + *flag = test_mm512_addn_round_ps(); + break; + case UT_MM512_ADDN_ROUND_PD: + ret = "MM512_ADDN_ROUND_PD"; + *flag = test_mm512_addn_round_pd(); + break; + case UT_MM512_ADDSETC_EPI32: + ret = "MM512_ADDSETC_EPI32"; + *flag = test_mm512_addsetc_epi32(); + break; + case UT_MM512_ADDSETS_EPI32: + ret = "MM512_ADDSETS_EPI32"; + *flag = test_mm512_addsets_epi32(); + break; + case UT_MM512_ADDSETS_PS: + ret = "MM512_ADDSETS_PS"; + *flag = test_mm512_addsets_ps(); + break; + case UT_MM512_ADDSETS_ROUND_PS: + ret = "MM512_ADDSETS_ROUND_PS"; + *flag = test_mm512_addsets_round_ps(); + break; + case UT_MM256_ADDSUB_PS: + ret = "MM256_ADDSUB_PS"; + *flag = test_mm256_addsub_ps(); + break; + case UT_MM256_ADDSUB_PD: + ret = "MM256_ADDSUB_PD"; + *flag = test_mm256_addsub_pd(); + break; + case UT_MM256_SUB_EPI16: + ret = "MM256_SUB_EPI16"; + *flag = test_mm256_sub_epi16(); + break; + case UT_MM256_SUB_EPI32: + ret = "MM256_SUB_EPI32"; + *flag = test_mm256_sub_epi32(); + break; + case UT_MM256_SUB_EPI64: + ret = "MM256_SUB_EPI64"; + *flag = test_mm256_sub_epi64(); + break; + case UT_MM256_SUB_EPI8: + ret = "MM256_SUB_EPI8"; + *flag = test_mm256_sub_epi8(); + break; + case UT_MM256_SUB_PD: + ret = "MM256_SUB_PD"; + *flag = test_mm256_sub_pd(); + break; + case UT_MM256_SUB_PS: + ret = "MM256_SUB_PS"; + *flag = test_mm256_sub_ps(); + break; + case UT_MM512_SUB_EPI16: + ret = "MM512_SUB_EPI16"; + *flag = test_mm512_sub_epi16(); + break; + case UT_MM512_SUB_EPI32: + ret = "MM512_SUB_EPI32"; + *flag = test_mm512_sub_epi32(); + break; + case UT_MM512_SUB_EPI64: + ret = "MM512_SUB_EPI64"; + *flag = test_mm512_sub_epi64(); + break; + case UT_MM512_SUB_EPI8: + ret = "MM512_SUB_EPI8"; + *flag = test_mm512_sub_epi8(); + break; 
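+        /*
+         * The remaining cases all follow the same shape: record the printable
+         * name and run the matching unit test. A minimal driver loop (a sketch,
+         * not part of this patch) could look like:
+         *     int pass = 0;
+         *     const char *name = RunTest(UT_MM256_ADD_EPI8, &pass);
+         *     printf("%s: %s\n", name, pass ? "PASS" : "FAIL");
+         */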
+ case UT_MM512_SUB_PD: + ret = "MM512_SUB_PD"; + *flag = test_mm512_sub_pd(); + break; + case UT_MM512_SUB_PS: + ret = "MM512_SUB_PS"; + *flag = test_mm512_sub_ps(); + break; + case UT_MM256_SUBS_EPI16: + ret = "MM256_SUBS_EPI16"; + *flag = test_mm256_subs_epi16(); + break; + case UT_MM256_SUBS_EPI8: + ret = "MM256_SUBS_EPI8"; + *flag = test_mm256_subs_epi8(); + break; + case UT_MM256_SUBS_EPU16: + ret = "MM256_SUBS_EPU16"; + *flag = test_mm256_subs_epu16(); + break; + case UT_MM256_SUBS_EPU8: + ret = "MM256_SUBS_EPU8"; + *flag = test_mm256_subs_epu8(); + break; + case UT_MM512_SUBS_EPI16: + ret = "MM512_SUBS_EPI16"; + *flag = test_mm512_subs_epi16(); + break; + case UT_MM512_SUBS_EPI8: + ret = "MM512_SUBS_EPI8"; + *flag = test_mm512_subs_epi8(); + break; + case UT_MM512_SUBS_EPU16: + ret = "MM512_SUBS_EPU16"; + *flag = test_mm512_subs_epu16(); + break; + case UT_MM512_SUBS_EPU8: + ret = "MM512_SUBS_EPU8"; + *flag = test_mm512_subs_epu8(); + break; + case UT_MM512_SUB_ROUND_PD: + ret = "MM512_SUB_ROUND_PD"; + *flag = test_mm512_sub_round_pd(); + break; + case UT_MM512_SUB_ROUND_PS: + ret = "MM512_SUB_ROUND_PS"; + *flag = test_mm512_sub_round_ps(); + break; + case UT_MM512_SUBR_EPI32: + ret = "MM512_SUBR_EPI32"; + *flag = test_mm512_subr_epi32(); + break; + case UT_MM512_SUBR_PS: + ret = "MM512_SUBR_PS"; + *flag = test_mm512_subr_ps(); + break; + case UT_MM512_SUBR_PD: + ret = "MM512_SUBR_PD"; + *flag = test_mm512_subr_pd(); + break; + case UT_MM512_SUBR_ROUND_PS: + ret = "MM512_SUBR_ROUND_PS"; + *flag = test_mm512_subr_round_ps(); + break; + case UT_MM512_SUBR_ROUND_PD: + ret = "MM512_SUBR_ROUND_PD"; + *flag = test_mm512_subr_round_pd(); + break; + case UT_MM512_SUBSETB_EPI32: + ret = "MM512_SUBSETB_EPI32"; + *flag = test_mm512_subsetb_epi32(); + break; + case UT_MM512_SUBRSETB_EPI32: + ret = "MM512_SUBRSETB_EPI32"; + *flag = test_mm512_subrsetb_epi32(); + break; + case UT_MM256_ZEROUPPER: + ret = "MM256_ZEROUPPER"; + *flag = test_mm256_zeroupper(); + break; + case UT_MM512_BSLLI_EPI128: + ret = "MM512_BSLLI_EPI128"; + *flag = test_mm512_bslli_epi128(); + break; + case UT_MM512_BSRLI_EPI128: + ret = "MM512_BSRLI_EPI128"; + *flag = test_mm512_bsrli_epi128(); + break; + case UT_MM512_PERMUTEXVAR_EPI64: + ret = "MM512_PERMUTEXVAR_EPI64"; + *flag = test_mm512_permutexvar_epi64(); + break; + case UT_MM512_EXTRACTI32X4_EPI32: + ret = "MM512_EXTRACTI32X4_EPI32"; + *flag = test_mm512_extracti32x4_epi32(); + break; + case UT_MM512_TEST_EPI8_MASK: + ret = "MM512_TEST_EPI8_MASK"; + *flag = test_mm512_test_epi8_mask(); + break; + case UT_MM512_TEST_EPI32_MASK: + ret = "MM512_TEST_EPI32_MASK"; + *flag = test_mm512_test_epi32_mask(); + break; + case UT_MM512_TEST_EPI64_MASK: + ret = "MM512_TEST_EPI64_MASK"; + *flag = test_mm512_test_epi64_mask(); + break; + case UT_MM256_MUL_EPI32: + ret = "MM256_MUL_EPI32"; + *flag = test_mm256_mul_epi32(); + break; + case UT_MM256_MUL_EPU32: + ret = "MM256_MUL_EPU32"; + *flag = test_mm256_mul_epu32(); + break; + case UT_MM256_MUL_PD: + ret = "MM256_MUL_PD"; + *flag = test_mm256_mul_pd(); + break; + case UT_MM256_MUL_PS: + ret = "MM256_MUL_PS"; + *flag = test_mm256_mul_ps(); + break; + case UT_MM512_MUL_EPI32: + ret = "MM512_MUL_EPI32"; + *flag = test_mm512_mul_epi32(); + break; + case UT_MM512_MUL_EPU32: + ret = "MM512_MUL_EPU32"; + *flag = test_mm512_mul_epu32(); + break; + case UT_MM512_MUL_PD: + ret = "MM512_MUL_PD"; + *flag = test_mm512_mul_pd(); + break; + case UT_MM512_MUL_PS: + ret = "MM512_MUL_PS"; + *flag = test_mm512_mul_ps(); + break; + case 
UT_MM256_MULHI_EPI16: + ret = "MM256_MULHI_EPI16"; + *flag = test_mm256_mulhi_epi16(); + break; + case UT_MM256_MULHI_EPU16: + ret = "MM256_MULHI_EPU16"; + *flag = test_mm256_mulhi_epu16(); + break; + case UT_MM512_MULHI_EPI16: + ret = "MM512_MULHI_EPI16"; + *flag = test_mm512_mulhi_epi16(); + break; + case UT_MM512_MULHI_EPU16: + ret = "MM512_MULHI_EPU16"; + *flag = test_mm512_mulhi_epu16(); + break; + case UT_MM512_MULHI_EPI32: + ret = "MM512_MULHI_EPI32"; + *flag = test_mm512_mulhi_epi32(); + break; + case UT_MM512_MULHI_EPU32: + ret = "MM512_MULHI_EPU32"; + *flag = test_mm512_mulhi_epu32(); + break; + case UT_MM256_MULLO_EPI16: + ret = "MM256_MULLO_EPI16"; + *flag = test_mm256_mullo_epi16(); + break; + case UT_MM256_MULLO_EPI32: + ret = "MM256_MULLO_EPI32"; + *flag = test_mm256_mullo_epi32(); + break; + case UT_MM256_MULLO_EPI64: + ret = "MM256_MULLO_EPI64"; + *flag = test_mm256_mullo_epi64(); + break; + case UT_MM512_MULLO_EPI16: + ret = "MM512_MULLO_EPI16"; + *flag = test_mm512_mullo_epi16(); + break; + case UT_MM512_MULLO_EPI32: + ret = "MM512_MULLO_EPI32"; + *flag = test_mm512_mullo_epi32(); + break; + case UT_MM512_MULLO_EPI64: + ret = "MM512_MULLO_EPI64"; + *flag = test_mm512_mullo_epi64(); + break; + case UT_MM512_MULLOX_EPI64: + ret = "MM512_MULLOX_EPI64"; + *flag = test_mm512_mullox_epi64(); + break; + case UT_MM256_MULHRS_EPI16: + ret = "MM256_MULHRS_EPI16"; + *flag = test_mm256_mulhrs_epi16(); + break; + case UT_MM512_MULHRS_EPI16: + ret = "MM512_MULHRS_EPI16"; + *flag = test_mm512_mulhrs_epi16(); + break; + case UT_MM512_MUL_ROUND_PD: + ret = "MM512_MUL_ROUND_PD"; + *flag = test_mm512_mul_round_pd(); + break; + case UT_MM512_MUL_ROUND_PS: + ret = "MM512_MUL_ROUND_PS"; + *flag = test_mm512_mul_round_ps(); + break; + case UT_MM_SLL_EPI64: + ret = "MM_SLL_EPI64"; + *flag = test_mm_sll_epi64(); + break; + case UT_MM256_SLL_EPI32: + ret = "MM256_SLL_EPI32"; + *flag = test_mm256_sll_epi32(); + break; + case UT_MM256_SLL_EPI64: + ret = "MM256_SLL_EPI64"; + *flag = test_mm256_sll_epi64(); + break; + case UT_MM256_SLLI_EPI64: + ret = "MM256_SLLI_EPI64"; + *flag = test_mm256_slli_epi64(); + break; + case UT_MM256_SLLI_EPI32: + ret = "MM256_SLLI_EPI32"; + *flag = test_mm256_slli_epi32(); + break; + case UT_MM256_SRLI_EPI64: + ret = "MM256_SRLI_EPI64"; + *flag = test_mm256_srli_epi64(); + break; + case UT_MM512_SLL_EPI64: + ret = "MM512_SLL_EPI64"; + *flag = test_mm512_sll_epi64(); + break; + case UT_MM512_SLLI_EPI64: + ret = "MM512_SLLI_EPI64"; + *flag = test_mm512_slli_epi64(); + break; + case UT_MM512_SRLI_EPI64: + ret = "MM512_SRLI_EPI64"; + *flag = test_mm512_srli_epi64(); + break; + case UT_MM256_SLLI_SI256: + ret = "MM256_SLLI_SI256"; + *flag = test_mm256_slli_si256(); + break; + case UT_MM256_SRLI_SI256: + ret = "MM256_SRLI_SI256"; + *flag = test_mm256_srli_si256(); + break; + case UT_MM256_BLENDV_PD: + ret = "MM256_BLENDV_PD"; + *flag = test_mm256_blendv_pd(); + break; + case UT_MM256_BLENDV_PS: + ret = "MM256_BLENDV_PS"; + *flag = test_mm256_blendv_ps(); + break; + case UT_MM256_BLEND_PD: + ret = "MM256_BLEND_PD"; + *flag = test_mm256_blend_pd(); + break; + case UT_MM256_BLEND_PS: + ret = "MM256_BLEND_PS"; + *flag = test_mm256_blend_ps(); + break; + case UT_MM512_MASK_BLEND_PD: + ret = "MM512_MASK_BLEND_PD"; + *flag = test_mm512_mask_blend_pd(); + break; + case UT_MM512_MASK_BLEND_PS: + ret = "MM512_MASK_BLEND_PS"; + *flag = test_mm512_mask_blend_ps(); + break; + case UT_MM256_AND_SI256: + ret = "MM256_AND_SI256"; + *flag = test_mm256_and_si256(); + break; + case 
UT_MM512_AND_SI512:
+            ret = "MM512_AND_SI512";
+            *flag = test_mm512_and_si512();
+            break;
+        case UT_MM256_OR_SI256:
+            ret = "MM256_OR_SI256";
+            *flag = test_mm256_or_si256();
+            break;
+        case UT_MM512_OR_SI512:
+            ret = "MM512_OR_SI512";
+            *flag = test_mm512_or_si512();
+            break;
+        case UT_MM256_ANDNOT_SI256:
+            ret = "MM256_ANDNOT_SI256";
+            *flag = test_mm256_andnot_si256();
+            break;
+        case UT_MM512_ANDNOT_SI512:
+            ret = "MM512_ANDNOT_SI512";
+            *flag = test_mm512_andnot_si512();
+            break;
+        case UT_MM256_XOR_SI256:
+            ret = "MM256_XOR_SI256";
+            *flag = test_mm256_xor_si256();
+            break;
+        case UT_MM512_XOR_SI512:
+            ret = "MM512_XOR_SI512";
+            *flag = test_mm512_xor_si512();
+            break;
+        case UT_MM256_OR_PS:
+            ret = "MM256_OR_PS";
+            *flag = test_mm256_or_ps();
+            break;
+        case UT_MM256_OR_PD:
+            ret = "MM256_OR_PD";
+            *flag = test_mm256_or_pd();
+            break;
+        case UT_MM512_AND_EPI32:
+            ret = "MM512_AND_EPI32";
+            *flag = test_mm512_and_epi32();
+            break;
+        case UT_MM512_AND_EPI64:
+            ret = "MM512_AND_EPI64";
+            *flag = test_mm512_and_epi64();
+            break;
+        case UT_MM512_OR_EPI32:
+            ret = "MM512_OR_EPI32";
+            *flag = test_mm512_or_epi32();
+            break;
+        case UT_MM512_OR_EPI64:
+            ret = "MM512_OR_EPI64";
+            *flag = test_mm512_or_epi64();
+            break;
+        case UT_MM512_XOR_PS:
+            ret = "MM512_XOR_PS";
+            *flag = test_mm512_xor_ps();
+            break;
+        case UT_MM512_XOR_PD:
+            ret = "MM512_XOR_PD";
+            *flag = test_mm512_xor_pd();
+            break;
+        case UT_MM256_CMPEQ_EPI8:
+            ret = "MM256_CMPEQ_EPI8";
+            *flag = test_mm256_cmpeq_epi8();
+            break;
+        case UT_MM256_CMPEQ_EPI32:
+            ret = "MM256_CMPEQ_EPI32";
+            *flag = test_mm256_cmpeq_epi32();
+            break;
+        case UT_MM_CMPEQ_EPI64:
+            ret = "MM_CMPEQ_EPI64";
+            *flag = test_mm_cmpeq_epi64();
+            break;
+        case UT_MM512_CMP_EPI32_MASK:
+            ret = "MM512_CMP_EPI32_MASK";
+            *flag = test_mm512_cmp_epi32_mask();
+            break;
+        case UT_MM512_CMP_EPI8_MASK:
+            ret = "MM512_CMP_EPI8_MASK";
+            *flag = test_mm512_cmp_epi8_mask();
+            break;
+        case UT_MM512_CMPEQ_EPI8_MASK:
+            ret = "MM512_CMPEQ_EPI8_MASK";
+            *flag = test_mm512_cmpeq_epi8_mask();
+            break;
+        case UT_MM512_MASK_CMPEQ_EPI8_MASK:
+            ret = "MM512_MASK_CMPEQ_EPI8_MASK";
+            *flag = test_mm512_mask_cmpeq_epi8_mask();
+            break;
+        case UT_MM512_SET_EPI32:
+            ret = "MM512_SET_EPI32";
+            *flag = test_mm512_set_epi32();
+            break;
+        case UT_MM512_SET_EPI64:
+            ret = "MM512_SET_EPI64";
+            *flag = test_mm512_set_epi64();
+            break;
+        case UT_MM512_SET1_EPI32:
+            ret = "MM512_SET1_EPI32";
+            *flag = test_mm512_set1_epi32();
+            break;
+        case UT_MM512_SET1_EPI64:
+            ret = "MM512_SET1_EPI64";
+            *flag = test_mm512_set1_epi64();
+            break;
+        case UT_MM512_SET1_EPI8:
+            ret = "MM512_SET1_EPI8";
+            *flag = test_mm512_set1_epi8();
+            break;
+        case UT_MM512_SET_PS:
+            ret = "MM512_SET_PS";
+            *flag = test_mm512_set_ps();
+            break;
+        case UT_MM512_SET_PD:
+            ret = "MM512_SET_PD";
+            *flag = test_mm512_set_pd();
+            break;
+        case UT_MM512_SET1_PS:
+            ret = "MM512_SET1_PS";
+            *flag = test_mm512_set1_ps();
+            break;
+        case UT_MM512_SET1_PD:
+            ret = "MM512_SET1_PD";
+            *flag = test_mm512_set1_pd();
+            break;
+        case UT_MM512_SETZERO_PS:
+            ret = "MM512_SETZERO_PS";
+            *flag = test_mm512_setzero_ps();
+            break;
+        case UT_MM512_SETZERO_PD:
+            ret = "MM512_SETZERO_PD";
+            *flag = test_mm512_setzero_pd();
+            break;
+        case UT_MM_MOVE_SD:
+            ret = "MM_MOVE_SD";
+            *flag = test_mm_move_sd();
+            break;
+        case UT_MM_MOVE_SS:
+            ret = "MM_MOVE_SS";
+            *flag = test_mm_move_ss();
+            break;
+        case UT_MM256_MOVEMASK_EPI8:
+            ret = "MM256_MOVEMASK_EPI8";
+            *flag = test_mm256_movemask_epi8();
+            break;
+        case UT_MM256_MOVEMASK_PS:
+            ret = "MM256_MOVEMASK_PS";
+ *flag = test_mm256_movemask_ps(); + break; + case UT_MM_TESTZ_SI128: + ret = "MM_TESTZ_SI128"; + *flag = test_mm_testz_si128(); + break; + case UT_MM256_TESTZ_SI256: + ret = "MM256_TESTZ_SI256"; + *flag = test_mm256_testz_si256(); + break; + case UT_MM512_MOVM_EPI8: + ret = "MM512_MOVM_EPI8"; + *flag = test_mm512_movm_epi8(); + break; + case UT_MM_EXTRACT_EPI32: + ret = "MM_EXTRACT_EPI32"; + *flag = test_mm_extract_epi32(); + break; + case UT_MM_EXTRACT_EPI64: + ret = "MM_EXTRACT_EPI64"; + *flag = test_mm_extract_epi64(); + break; + case UT_MM256_EXTRACTI128_SI256: + ret = "MM256_EXTRACTI128_SI256"; + *flag = test_mm256_extracti128_si256(); + break; + case UT_MM_EXTRACT_PS: + ret = "MM_EXTRACT_PS"; + *flag = test_mm_extract_ps(); + break; + case UT_MM256_EXTRACT_EPI32: + ret = "MM256_EXTRACT_EPI32"; + *flag = test_mm256_extract_epi32(); + break; + case UT_MM256_EXTRACT_EPI64: + ret = "MM256_EXTRACT_EPI64"; + *flag = test_mm256_extract_epi64(); + break; + case UT_MM256_EXTRACTF128_PS: + ret = "MM256_EXTRACTF128_PS"; + *flag = test_mm256_extractf128_ps(); + break; + case UT_MM256_EXTRACTF128_PD: + ret = "MM256_EXTRACTF128_PD"; + *flag = test_mm256_extractf128_pd(); + break; + case UT_MM512_EXTRACTF32x8_PS: + ret = "MM512_EXTRACTF32x8_PS"; + *flag = test_mm512_extractf32x8_ps(); + break; + case UT_MM512_EXTRACTF64x4_PD: + ret = "MM512_EXTRACTF64x4_PD"; + *flag = test_mm512_extractf64x4_pd(); + break; + case UT_MM_CRC32_U8: + ret = "MM_CRC32_U8"; + *flag = test_mm_crc32_u8(); + break; + case UT_MM_CRC32_U16: + ret = "MM_CRC32_U16"; + *flag = test_mm_crc32_u16(); + break; + case UT_MM_CRC32_U32: + ret = "MM_CRC32_U32"; + *flag = test_mm_crc32_u32(); + break; + case UT_MM_CRC32_U64: + ret = "MM_CRC32_U64"; + *flag = test_mm_crc32_u64(); + break; + case UT_MM256_UNPACKLO_EPI8: + ret = "MM256_UNPACKLO_EPI8"; + *flag = test_mm256_unpacklo_epi8(); + break; + case UT_MM256_UNPACKHI_EPI8: + ret = "MM256_UNPACKHI_EPI8"; + *flag = test_mm256_unpackhi_epi8(); + break; + case UT_MM512_UNPACKLO_EPI8: + ret = "MM512_UNPACKLO_EPI8"; + *flag = test_mm512_unpacklo_epi8(); + break; + case UT_MM512_UNPACKHI_EPI8: + ret = "MM512_UNPACKHI_EPI8"; + *flag = test_mm512_unpackhi_epi8(); + break; + + case UT_MM256_STORE_SI256: + ret = "MM256_STORE_SI256"; + *flag = test_mm256_store_si256(); + break; + case UT_MM256_STOREU_SI256: + ret = "MM256_STOREU_SI256"; + *flag = test_mm256_storeu_si256(); + break; + case UT_MM512_STORE_SI512: + ret = "MM512_STORE_SI512"; + *flag = test_mm512_store_si512(); + break; + case UT_MM256_INSERTI128_SI256: + ret = "MM256_INSERTI128_SI256"; + *flag = test_mm256_inserti128_si256(); + break; + case UT_MM256_INSERTF128_PD: + ret = "MM256_INSERTF128_PD"; + *flag = test_mm256_insertf128_pd(); + break; + case UT_MM256_INSERTF128_PS: + ret = "MM256_INSERTF128_PS"; + *flag = test_mm256_insertf128_ps(); + break; + case UT_MM256_PERMUTE4X64_EPI64: + ret = "MM256_PERMUTE4X64_EPI64"; + *flag = test_mm256_permute4x64_epi64(); + break; + case UT_MM_SET_PD: + ret = "MM_SET_PD"; + *flag = test_mm_set_pd(); + break; + case UT_MM256_SET_EPI32: + ret = "MM256_SET_EPI32"; + *flag = test_mm256_set_epi32(); + break; + case UT_MM256_SET_EPI64X: + ret = "MM256_SET_EPI64X"; + *flag = test_mm256_set_epi64x(); + break; + case UT_MM256_SET_M128I: + ret = "MM256_SET_M128I"; + *flag = test_mm256_set_m128i(); + break; + case UT_MM256_SET_PS: + ret = "MM256_SET_PS"; + *flag = test_mm256_set_ps(); + break; + case UT_MM256_SET_PD: + ret = "MM256_SET_PD"; + *flag = test_mm256_set_pd(); + break; + case 
UT_MM256_SETZERO_SI256: + ret = "MM256_SETZERO_SI256"; + *flag = test_mm256_setzero_si256(); + break; + case UT_MM256_SETZERO_PS: + ret = "MM256_SETZERO_PS"; + *flag = test_mm256_setzero_ps(); + break; + case UT_MM256_SETZERO_PD: + ret = "MM256_SETZERO_PD"; + *flag = test_mm256_setzero_pd(); + break; + case UT_MM_SET1_EPI64X: + ret = "MM_SET1_EPI64X"; + *flag = test_mm_set1_epi64x(); + break; + case UT_MM_SET1_PD: + ret = "MM_SET1_PD"; + *flag = test_mm_set1_pd(); + break; + case UT_MM256_SET1_EPI8: + ret = "MM256_SET1_EPI8"; + *flag = test_mm256_set1_epi8(); + break; + case UT_MM256_SET1_EPI32: + ret = "MM256_SET1_EPI32"; + *flag = test_mm256_set1_epi32(); + break; + case UT_MM256_SET1_EPI64X: + ret = "MM256_SET1_EPI64X"; + *flag = test_mm256_set1_epi64x(); + break; + case UT_MM256_SET1_PD: + ret = "MM256_SET1_PD"; + *flag = test_mm256_set1_pd(); + break; + case UT_MM256_SET1_PS: + ret = "MM256_SET1_PS"; + *flag = test_mm256_set1_ps(); + break; + case UT_MM256_LOAD_SI256: + ret = "MM256_LOAD_SI256"; + *flag = test_mm256_load_si256(); + break; + case UT_MM256_LOADU_SI256: + ret = "MM256_LOADU_SI256"; + *flag = test_mm256_loadu_si256(); + break; + case UT_MM256_MASKLOAD_EPI32: + ret = "MM256_MASKLOAD_EPI32"; + *flag = test_mm256_maskload_epi32(); + break; + case UT_MM512_LOAD_SI512: + ret = "MM512_LOAD_SI512"; + *flag = test_mm512_load_si512(); + break; + case UT_MM512_LOADU_SI512: + ret = "MM512_LOADU_SI512"; + *flag = test_mm512_loadu_si512(); + break; + case UT_MM512_MASK_LOADU_EPI8: + ret = "MM512_MASK_LOADU_EPI8"; + *flag = test_mm512_mask_loadu_epi8(); + break; + case UT_MM512_MASKZ_LOADU_EPI8: + ret = "MM512_MASKZ_LOADU_EPI8"; + *flag = test_mm512_maskz_loadu_epi8(); + break; + case UT_MM512_ABS_EPI8: + ret = "MM512_ABS_EPI8"; + *flag = test_mm512_abs_epi8(); + break; + case UT_MM256_BROADCASTQ_EPI64: + ret = "MM256_BROADCASTQ_EPI64"; + *flag = test_mm256_broadcastq_epi64(); + break; + case UT_MM256_BROADCASTSI128_SI256: + ret = "MM256_BROADCASTSI128_SI256"; + *flag = test_mm256_broadcastsi128_si256(); + break; + case UT_MM512_BROADCAST_I32X4: + ret = "MM512_BROADCAST_I32X4"; + *flag = test_mm512_broadcast_i32x4(); + break; + case UT_MM512_BROADCAST_I64X4: + ret = "MM512_BROADCAST_I64X4"; + *flag = test_mm512_broadcast_i64x4(); + break; + case UT_MM512_MASK_BROADCAST_I64X4: + ret = "MM512_MASK_BROADCAST_I64X4"; + *flag = test_mm512_mask_broadcast_i64x4(); + break; + case UT_MM256_CASTPD128_PD256: + ret = "MM256_CASTPD128_PD256"; + *flag = test_mm256_castpd128_pd256(); + break; + case UT_MM256_CASTPD256_PD128: + ret = "MM256_CASTPD256_PD128"; + *flag = test_mm256_castpd256_pd128(); + break; + case UT_MM256_CASTPS128_PS256: + ret = "MM256_CASTPS128_PS256"; + *flag = test_mm256_castps128_ps256(); + break; + case UT_MM256_CASTPS256_PS128: + ret = "MM256_CASTPS256_PS128"; + *flag = test_mm256_castps256_ps128(); + break; + case UT_MM256_CASTSI128_SI256: + ret = "MM256_CASTSI128_SI256"; + *flag = test_mm256_castsi128_si256(); + break; + case UT_MM256_CASTSI256_PS: + ret = "MM256_CASTSI256_PS"; + *flag = test_mm256_castsi256_ps(); + break; + case UT_MM256_CASTSI256_SI128: + ret = "MM256_CASTSI256_SI128"; + *flag = test_mm256_castsi256_si128(); + break; + case UT_MM256_CVTEPI32_PD: + ret = "MM256_CVTEPI32_PD"; + *flag = test_mm256_cvtepi32_pd(); + break; + case UT_MM256_CVTEPI32_PS: + ret = "MM256_CVTEPI32_PS"; + *flag = test_mm256_cvtepi32_ps(); + break; + case UT_MM256_SHUFFLE_EPI8: + ret = "MM256_SHUFFLE_EPI8"; + *flag = test_mm256_shuffle_epi8(); + break; + case UT_MM512_SHUFFLE_EPI8: + 
ret = "MM512_SHUFFLE_EPI8"; + *flag = test_mm512_shuffle_epi8(); + break; + case UT_MM512_MASKZ_SHUFFLE_EPI8: + ret = "MM512_MASKZ_SHUFFLE_EPI8"; + *flag = test_mm512_maskz_shuffle_epi8(); + break; + case UT_MM256_MULTISHIFT_EPI64_EPI8: + ret = "MM256_MULTISHIFT_EPI64_EPI8"; + *flag = test_mm256_multishift_epi64_epi8(); + break; + case UT_MM512_MULTISHIFT_EPI64_EPI8: + ret = "MM512_MULTISHIFT_EPI64_EPI8"; + *flag = test_mm512_multishift_epi64_epi8(); + break; + case UT_MM256_ALIGNR_EPI8: + ret = "MM256_ALIGNR_EPI8"; + *flag = test_mm256_alignr_epi8(); + break; + case UT_MM_CMPESTRI: + ret = "MM_CMPESTRI"; + *flag = test_mm_cmpestri(); + break; + case UT_MM_CMPESTRM: + ret = "MM_CMPESTRM"; + *flag = test_mm_cmpestrm(); + break; + case UT_MM_INSERT_EPI32: + ret = "MM_INSERT_EPI32"; + *flag = test_mm_insert_epi32(); + break; + case UT_MM256_INSERT_EPI32: + ret = "MM256_INSERT_EPI32"; + *flag = test_mm256_insert_epi32(); + break; + case UT_MM256_INSERT_EPI64: + ret = "MM256_INSERT_EPI64"; + *flag = test_mm256_insert_epi64(); + break; + case UT_MM512_CASTPD128_PD512: + ret = "MM512_CASTPD128_PD512"; + *flag = test_mm512_castpd128_pd512(); + break; + case UT_MM512_CASTPD512_PD128: + ret = "MM512_CASTPD512_PD128"; + *flag = test_mm512_castpd512_pd128(); + break; + case UT_MM512_CASTPS128_PS512: + ret = "MM512_CASTPS128_PS512"; + *flag = test_mm512_castps128_ps512(); + break; + case UT_MM512_CASTPS512_PS128: + ret = "MM512_CASTPS512_PS128"; + *flag = test_mm512_castps512_ps128(); + break; + case UT_MM512_CVTEPI32_PD: + ret = "MM512_CVTEPI32_PD"; + *flag = test_mm512_cvtepi32_pd(); + break; + case UT_MM512_CVTEPI32_PS: + ret = "MM512_CVTEPI32_PS"; + *flag = test_mm512_cvtepi32_ps(); + break; + case UT_MM512_INSERTF32X8: + ret = "MM512_INSERTF32X8"; + *flag = test_mm512_insertf32x8(); + break; + case UT_MM512_INSERTF64X4: + ret = "MM512_INSERTF64X4"; + *flag = test_mm512_insertf64x4(); + break; + case UT_MM512_INSERTI32X8: + ret = "MM512_INSERTI32X8"; + *flag = test_mm512_inserti32x8(); + break; + case UT_MM512_INSERTI64X4: + ret = "MM512_INSERTI64X4"; + *flag = test_mm512_inserti64x4(); + break; + case UT_MM512_PERMUTEXVAR_EPI32: + ret = "MM512_PERMUTEXVAR_EPI32"; + *flag = test_mm512_permutexvar_epi32(); + break; + case UT_MM256_CMP_PD: + ret = "MM256_CMP_PD"; + *flag = test_mm256_cmp_pd(); + break; + case UT_MM256_CMP_PS: + ret = "MM256_CMP_PS"; + *flag = test_mm256_cmp_ps(); + break; + case UT_MM512_CMP_PD_MASK: + ret = "MM512_CMP_PD_MASK"; + *flag = test_mm512_cmp_pd_mask(); + break; + case UT_MM512_CMP_PS_MASK: + ret = "MM512_CMP_PS_MASK"; + *flag = test_mm512_cmp_ps_mask(); + break; + default: + break; + } + return ret; +} + +int IsEqualFloat32x4(__m128 a, const float32_t *x, float epsilon) +{ + float e0 = fabs(vgetq_lane_f32(a, 0) - x[0]); + float e1 = fabs(vgetq_lane_f32(a, 1) - x[1]); + float e2 = fabs(vgetq_lane_f32(a, 2) - x[2]); + float e3 = fabs(vgetq_lane_f32(a, 3) - x[3]); + ASSERT_RETURN(e0 < epsilon); + ASSERT_RETURN(e1 < epsilon); + ASSERT_RETURN(e2 < epsilon); + ASSERT_RETURN(e3 < epsilon); + return TRUE; +} +int IsEqualFloat64x2(__m128d a, const float64_t *x, float epsilon) +{ + double e0 = fabs(vgetq_lane_f64(a, 0) - x[0]); + double e1 = fabs(vgetq_lane_f64(a, 1) - x[1]); + ASSERT_RETURN(e0 < epsilon); + ASSERT_RETURN(e1 < epsilon); + return TRUE; +} + +int IsEqualFloat32x8(__m256 a, const float *x, float eps) +{ + __m128 tmp; + for (unsigned int i = 0; i < sizeof(__m256) / sizeof(__m128); i++) { + tmp = a.vect_f32[i]; + ASSERT_RETURN(IsEqualFloat32x4(tmp, x + i * 
sizeof(__m128) / sizeof(float), eps)); + } + return TRUE; +} + +int IsEqualFloat64x4(__m256d a, const double *x, float eps) +{ + __m128d tmp; + for (unsigned int i = 0; i < sizeof(__m256d) / sizeof(__m128d); i++) { + tmp = a.vect_f64[i]; + ASSERT_RETURN(IsEqualFloat64x2(tmp, x + i * sizeof(__m128d) / sizeof(double), eps)); + } + return TRUE; +} + +int IsEqualFloat32x16(__m512 a, const float *x, float eps) +{ + __m128 tmp; + for (unsigned int i = 0; i < sizeof(__m512) / sizeof(__m128); i++) { + tmp = a.vect_f32[i]; + ASSERT_RETURN(IsEqualFloat32x4(tmp, x + i * sizeof(__m128) / sizeof(float), eps)); + } + return TRUE; +} + +int IsEqualFloat64x8(__m512d a, const double *x, float eps) +{ + __m128d tmp; + for (unsigned int i = 0; i < sizeof(__m512d) / sizeof(__m128d); i++) { + tmp = a.vect_f64[i]; + ASSERT_RETURN(IsEqualFloat64x2(tmp, x + i * sizeof(__m128d) / sizeof(double), eps)); + } + return TRUE; +} + +int test_mm_popcnt_u32() +{ + unsigned int a = 1587; + int expect = 6; + int res = _mm_popcnt_u32(a); + return (expect == res); +} +int test_mm_popcnt_u64() +{ + unsigned __int64 a = 34359738516; + __int64 expect = 4; + __int64 res = _mm_popcnt_u64(a); + return (expect == res); +} +int test_mm256_div_epi8() +{ + int8_t *a = g_test_mm256_div_epi8_data.a; + int8_t *b = g_test_mm256_div_epi8_data.b; + int8_t *expect = g_test_mm256_div_epi8_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16); + mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16); + } + __m256i res = _mm256_div_epi8(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm256_div_epi16() +{ + int16_t *a = g_test_mm256_div_epi16_data.a; + int16_t *b = g_test_mm256_div_epi16_data.b; + int16_t *expect = g_test_mm256_div_epi16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m256i res = _mm256_div_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm256_div_epi32() +{ + int32_t *a = g_test_mm256_div_epi32_data.a; + int32_t *b = g_test_mm256_div_epi32_data.b; + int32_t *expect = g_test_mm256_div_epi32_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m256i res = _mm256_div_epi32(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm256_div_epi64() +{ + int64_t *a = g_test_mm256_div_epi64_data.a; + int64_t *b = g_test_mm256_div_epi64_data.b; + int64_t *expect = g_test_mm256_div_epi64_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2); + mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2); + } + __m256i res = _mm256_div_epi64(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm256_div_epu8() +{ + uint8_t *a = g_test_mm256_div_epu8_data.a; + uint8_t *b = g_test_mm256_div_epu8_data.b; + uint8_t *expect = g_test_mm256_div_epu8_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u8[iCount] = vld1q_u8(a + iCount * 16); + mb.vect_u8[iCount] = vld1q_u8(b + iCount * 16); + } + __m256i res = _mm256_div_epu8(ma, mb); + return comp_return(expect, &res, 
sizeof(__m256i)); +} +int test_mm256_div_epu16() +{ + uint16_t *a = g_test_mm256_div_epu16_data.a; + uint16_t *b = g_test_mm256_div_epu16_data.b; + uint16_t *expect = g_test_mm256_div_epu16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u16[iCount] = vld1q_u16(a + iCount * 8); + mb.vect_u16[iCount] = vld1q_u16(b + iCount * 8); + } + __m256i res = _mm256_div_epu16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm256_div_epu32() +{ + uint32_t *a = g_test_mm256_div_epu32_data.a; + uint32_t *b = g_test_mm256_div_epu32_data.b; + uint32_t *expect = g_test_mm256_div_epu32_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u32[iCount] = vld1q_u32(a + iCount * 4); + mb.vect_u32[iCount] = vld1q_u32(b + iCount * 4); + } + __m256i res = _mm256_div_epu32(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm256_div_epu64() +{ + uint64_t *a = g_test_mm256_div_epu64_data.a; + uint64_t *b = g_test_mm256_div_epu64_data.b; + uint64_t *expect = g_test_mm256_div_epu64_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u64[iCount] = vld1q_u64(a + iCount * 2); + mb.vect_u64[iCount] = vld1q_u64(b + iCount * 2); + } + __m256i res = _mm256_div_epu64(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm256_div_pd() +{ + float64_t *a = g_test_mm256_div_pd_data.a; + float64_t *b = g_test_mm256_div_pd_data.b; + float64_t *expect = g_test_mm256_div_pd_data.expect; + int iCount; + __m256d ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m256d res = _mm256_div_pd(ma, mb); + + return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64); +} +int test_mm256_div_ps() +{ + float32_t *a = g_test_mm256_div_ps_data.a; + float32_t *b = g_test_mm256_div_ps_data.b; + float32_t *expect = g_test_mm256_div_ps_data.expect; + int iCount; + __m256 ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m256 res = _mm256_div_ps(ma, mb); + + return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32); +} +int test_mm512_div_epi8() +{ + int8_t *a = g_test_mm512_div_epi8_data.a; + int8_t *b = g_test_mm512_div_epi8_data.b; + int8_t *expect = g_test_mm512_div_epi8_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16); + mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16); + } + __m512i res = _mm512_div_epi8(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} +int test_mm512_div_epi16() +{ + int16_t *a = g_test_mm512_div_epi16_data.a; + int16_t *b = g_test_mm512_div_epi16_data.b; + int16_t *expect = g_test_mm512_div_epi16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m512i res = _mm512_div_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} +int test_mm512_div_epi32() +{ + int32_t *a = g_test_mm512_div_epi32_data.a; + int32_t *b = g_test_mm512_div_epi32_data.b; + int32_t *expect = g_test_mm512_div_epi32_data.expect; + int 
iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_div_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_div_epi64()
+{
+    int64_t *a = g_test_mm512_div_epi64_data.a;
+    int64_t *b = g_test_mm512_div_epi64_data.b;
+    int64_t *expect = g_test_mm512_div_epi64_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+        mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2);
+    }
+    __m512i res = _mm512_div_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_div_epu8()
+{
+    uint8_t *a = g_test_mm512_div_epu8_data.a;
+    uint8_t *b = g_test_mm512_div_epu8_data.b;
+    uint8_t *expect = g_test_mm512_div_epu8_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_u8[iCount] = vld1q_u8(a + iCount * 16);
+        mb.vect_u8[iCount] = vld1q_u8(b + iCount * 16);
+    }
+    __m512i res = _mm512_div_epu8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_div_epu16()
+{
+    uint16_t *a = g_test_mm512_div_epu16_data.a;
+    uint16_t *b = g_test_mm512_div_epu16_data.b;
+    uint16_t *expect = g_test_mm512_div_epu16_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_u16[iCount] = vld1q_u16(a + iCount * 8);
+        mb.vect_u16[iCount] = vld1q_u16(b + iCount * 8);
+    }
+    __m512i res = _mm512_div_epu16(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_div_epu32()
+{
+    uint32_t *a = g_test_mm512_div_epu32_data.a;
+    uint32_t *b = g_test_mm512_div_epu32_data.b;
+    uint32_t *expect = g_test_mm512_div_epu32_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_u32[iCount] = vld1q_u32(a + iCount * 4);
+        mb.vect_u32[iCount] = vld1q_u32(b + iCount * 4);
+    }
+    __m512i res = _mm512_div_epu32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_div_epu64()
+{
+    uint64_t *a = g_test_mm512_div_epu64_data.a;
+    uint64_t *b = g_test_mm512_div_epu64_data.b;
+    uint64_t *expect = g_test_mm512_div_epu64_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_u64[iCount] = vld1q_u64(a + iCount * 2);
+        mb.vect_u64[iCount] = vld1q_u64(b + iCount * 2);
+    }
+    __m512i res = _mm512_div_epu64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_div_pd()
+{
+    float64_t *a = g_test_mm512_div_pd_data.a;
+    float64_t *b = g_test_mm512_div_pd_data.b;
+    float64_t *expect = g_test_mm512_div_pd_data.expect;
+    int iCount;
+    __m512d ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2);
+        mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2);
+    }
+    __m512d res = _mm512_div_pd(ma, mb);
+
+    return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64);
+}
+int test_mm512_div_ps()
+{
+    float32_t *a = g_test_mm512_div_ps_data.a;
+    float32_t *b = g_test_mm512_div_ps_data.b;
+    float32_t *expect = g_test_mm512_div_ps_data.expect;
+    int iCount;
+    __m512 ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4);
+        mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4);
+    }
+    __m512 res = _mm512_div_ps(ma, mb);
+
+    return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32);
+}
+
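+/*
+ * Editor's sketch (hypothetical, not part of the original suite): the
+ * *_div_* tests above and the *_div_round_* tests below compare an
+ * element-wise quotient against precomputed tables; the `rounding` value
+ * stored in the test data is assumed to be built from _MM_FROUND_* flags
+ * such as _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC. A scalar
+ * reference for the float case could look like this (names illustrative):
+ */
+#if 0 /* reference model only; the shipped expectations come from the data tables */
+static void div_ps_reference(const float32_t *a, const float32_t *b, float32_t *expect, int n)
+{
+    for (int i = 0; i < n; i++) {
+        expect[i] = a[i] / b[i]; /* element-wise, matching _mm512_div_ps */
+    }
+}
+#endif
+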
+int test_mm512_div_round_ps()
+{
+    float32_t *a = g_test_mm512_div_round_ps_data.a;
+    float32_t *b = g_test_mm512_div_round_ps_data.b;
+    int rounding = g_test_mm512_div_round_ps_data.rounding;
+    float32_t *expect = g_test_mm512_div_round_ps_data.expect;
+    int iCount;
+    __m512 ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4);
+        mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4);
+    }
+    __m512 res = _mm512_div_round_ps(ma, mb, rounding);
+
+    return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32);
+}
+int test_mm512_div_round_pd()
+{
+    float64_t *a = g_test_mm512_div_round_pd_data.a;
+    float64_t *b = g_test_mm512_div_round_pd_data.b;
+    int rounding = g_test_mm512_div_round_pd_data.rounding;
+    float64_t *expect = g_test_mm512_div_round_pd_data.expect;
+    int iCount;
+    __m512d ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2);
+        mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2);
+    }
+    __m512d res = _mm512_div_round_pd(ma, mb, rounding);
+
+    return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64);
+}
+int test_mm256_add_epi8()
+{
+    int8_t *a = g_test_mm256_add_epi8_data.a;
+    int8_t *b = g_test_mm256_add_epi8_data.b;
+    int8_t *expect = g_test_mm256_add_epi8_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m256i res = _mm256_add_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_add_epi16()
+{
+    int16_t *a = g_test_mm256_add_epi16_data.a;
+    int16_t *b = g_test_mm256_add_epi16_data.b;
+    int16_t *expect = g_test_mm256_add_epi16_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8);
+        mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8);
+    }
+    __m256i res = _mm256_add_epi16(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_add_epi32()
+{
+    int32_t *a = g_test_mm256_add_epi32_data.a;
+    int32_t *b = g_test_mm256_add_epi32_data.b;
+    int32_t *expect = g_test_mm256_add_epi32_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m256i res = _mm256_add_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_add_epi64()
+{
+    int64_t *a = g_test_mm256_add_epi64_data.a;
+    int64_t *b = g_test_mm256_add_epi64_data.b;
+    int64_t *expect = g_test_mm256_add_epi64_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+        mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2);
+    }
+    __m256i res = _mm256_add_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm512_add_epi8()
+{
+    int8_t *a = g_test_mm512_add_epi8_data.a;
+    int8_t *b = g_test_mm512_add_epi8_data.b;
+    int8_t *expect = g_test_mm512_add_epi8_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m512i res = _mm512_add_epi8(ma, mb);
+    return
comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_add_epi16() +{ + int16_t *a = g_test_mm512_add_epi16_data.a; + int16_t *b = g_test_mm512_add_epi16_data.b; + int16_t *expect = g_test_mm512_add_epi16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m512i res = _mm512_add_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_add_epi32() +{ + int32_t *a = g_test_mm512_add_epi32_data.a; + int32_t *b = g_test_mm512_add_epi32_data.b; + int32_t *expect = g_test_mm512_add_epi32_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m512i res = _mm512_add_epi32(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_add_epi64() +{ + int64_t *a = g_test_mm512_add_epi64_data.a; + int64_t *b = g_test_mm512_add_epi64_data.b; + int64_t *expect = g_test_mm512_add_epi64_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2); + mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2); + } + __m512i res = _mm512_add_epi64(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm256_adds_epi8() +{ + int8_t *a = g_test_mm256_adds_epi8_data.a; + int8_t *b = g_test_mm256_adds_epi8_data.b; + int8_t *expect = g_test_mm256_adds_epi8_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16); + mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16); + } + __m256i res = _mm256_adds_epi8(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_adds_epi16() +{ + int16_t *a = g_test_mm256_adds_epi16_data.a; + int16_t *b = g_test_mm256_adds_epi16_data.b; + int16_t *expect = g_test_mm256_adds_epi16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m256i res = _mm256_adds_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_adds_epu8() +{ + uint8_t *a = g_test_mm256_adds_epu8_data.a; + uint8_t *b = g_test_mm256_adds_epu8_data.b; + uint8_t *expect = g_test_mm256_adds_epu8_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u8[iCount] = vld1q_u8(a + iCount * 16); + mb.vect_u8[iCount] = vld1q_u8(b + iCount * 16); + } + __m256i res = _mm256_adds_epu8(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_adds_epu16() +{ + uint16_t *a = g_test_mm256_adds_epu16_data.a; + uint16_t *b = g_test_mm256_adds_epu16_data.b; + uint16_t *expect = g_test_mm256_adds_epu16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u16[iCount] = vld1q_u16(a + iCount * 8); + mb.vect_u16[iCount] = vld1q_u16(b + iCount * 8); + } + __m256i res = _mm256_adds_epu16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm512_adds_epi8() +{ + int8_t *a = g_test_mm512_adds_epi8_data.a; + int8_t *b = g_test_mm512_adds_epi8_data.b; + int8_t *expect = 
g_test_mm512_adds_epi8_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16); + mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16); + } + __m512i res = _mm512_adds_epi8(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_adds_epi16() +{ + int16_t *a = g_test_mm512_adds_epi16_data.a; + int16_t *b = g_test_mm512_adds_epi16_data.b; + int16_t *expect = g_test_mm512_adds_epi16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m512i res = _mm512_adds_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_adds_epu8() +{ + uint8_t *a = g_test_mm512_adds_epu8_data.a; + uint8_t *b = g_test_mm512_adds_epu8_data.b; + uint8_t *expect = g_test_mm512_adds_epu8_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_u8[iCount] = vld1q_u8(a + iCount * 16); + mb.vect_u8[iCount] = vld1q_u8(b + iCount * 16); + } + __m512i res = _mm512_adds_epu8(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_adds_epu16() +{ + uint16_t *a = g_test_mm512_adds_epu16_data.a; + uint16_t *b = g_test_mm512_adds_epu16_data.b; + uint16_t *expect = g_test_mm512_adds_epu16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_u16[iCount] = vld1q_u16(a + iCount * 8); + mb.vect_u16[iCount] = vld1q_u16(b + iCount * 8); + } + __m512i res = _mm512_adds_epu16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm256_add_ps() +{ + float32_t *a = g_test_mm256_add_ps_data.a; + float32_t *b = g_test_mm256_add_ps_data.b; + float32_t *expect = g_test_mm256_add_ps_data.expect; + int iCount; + __m256 ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m256 res = _mm256_add_ps(ma, mb); + + return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm256_add_pd() +{ + float64_t *a = g_test_mm256_add_pd_data.a; + float64_t *b = g_test_mm256_add_pd_data.b; + float64_t *expect = g_test_mm256_add_pd_data.expect; + int iCount; + __m256d ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m256d res = _mm256_add_pd(ma, mb); + + return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_add_ps() +{ + float32_t *a = g_test_mm512_add_ps_data.a; + float32_t *b = g_test_mm512_add_ps_data.b; + float32_t *expect = g_test_mm512_add_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_add_ps(ma, mb); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm512_add_pd() +{ + float64_t *a = g_test_mm512_add_pd_data.a; + float64_t *b = g_test_mm512_add_pd_data.b; + float64_t *expect = g_test_mm512_add_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + 
mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_add_pd(ma, mb); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_add_round_ps() +{ + float32_t *a = g_test_mm512_add_round_ps_data.a; + float32_t *b = g_test_mm512_add_round_ps_data.b; + int rounding = g_test_mm512_add_round_ps_data.rounding; + float32_t *expect = g_test_mm512_add_round_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_add_round_ps(ma, mb, rounding); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm512_add_round_pd() +{ + float64_t *a = g_test_mm512_add_round_pd_data.a; + float64_t *b = g_test_mm512_add_round_pd_data.b; + int rounding = g_test_mm512_add_round_pd_data.rounding; + float64_t *expect = g_test_mm512_add_round_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_add_round_pd(ma, mb, rounding); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_addn_ps() +{ + float32_t *a = g_test_mm512_addn_ps_data.a; + float32_t *b = g_test_mm512_addn_ps_data.b; + float32_t *expect = g_test_mm512_addn_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_addn_ps(ma, mb); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm512_addn_pd() +{ + float64_t *a = g_test_mm512_addn_pd_data.a; + float64_t *b = g_test_mm512_addn_pd_data.b; + float64_t *expect = g_test_mm512_addn_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_addn_pd(ma, mb); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_addn_round_ps() +{ + float32_t *a = g_test_mm512_addn_round_ps_data.a; + float32_t *b = g_test_mm512_addn_round_ps_data.b; + int rounding = g_test_mm512_addn_round_ps_data.rounding; + float32_t *expect = g_test_mm512_addn_round_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_addn_round_ps(ma, mb, rounding); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm512_addn_round_pd() +{ + float64_t *a = g_test_mm512_addn_round_pd_data.a; + float64_t *b = g_test_mm512_addn_round_pd_data.b; + int rounding = g_test_mm512_addn_round_pd_data.rounding; + float64_t *expect = g_test_mm512_addn_round_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_addn_round_pd(ma, mb, rounding); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_addsetc_epi32() +{ + int32_t *a = g_test_mm512_addsetc_epi32_data.a; + int32_t *b 
= g_test_mm512_addsetc_epi32_data.b; + int32_t *expect = g_test_mm512_addsetc_epi32_data.expect; + __mmask16 expect_sign = g_test_mm512_addsetc_epi32_data.sign; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __mmask16 sign; + __m512i res = _mm512_addsetc_epi32(ma, mb, &sign); + return comp_return(expect, &res, sizeof(__m512i)) && (expect_sign == sign); +} + +int test_mm512_addsets_epi32() +{ + int32_t *a = g_test_mm512_addsets_epi32_data.a; + int32_t *b = g_test_mm512_addsets_epi32_data.b; + int32_t *expect = g_test_mm512_addsets_epi32_data.expect; + __mmask16 expect_sign = g_test_mm512_addsets_epi32_data.sign; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __mmask16 sign; + __m512i res = _mm512_addsets_epi32(ma, mb, &sign); + return comp_return(expect, &res, sizeof(__m512i)) && (expect_sign == sign); +} + +int test_mm512_addsets_ps() +{ + float32_t *a = g_test_mm512_addsets_ps_data.a; + float32_t *b = g_test_mm512_addsets_ps_data.b; + float32_t *expect = g_test_mm512_addsets_ps_data.expect; + __mmask16 expect_sign = g_test_mm512_addsets_ps_data.sign; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __mmask16 sign; + __m512 res = _mm512_addsets_ps(ma, mb, &sign); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32) && (expect_sign == sign); +} + +int test_mm512_addsets_round_ps() +{ + float32_t *a = g_test_mm512_addsets_round_ps_data.a; + float32_t *b = g_test_mm512_addsets_round_ps_data.b; + int rounding = g_test_mm512_addsets_round_ps_data.rounding; + float32_t *expect = g_test_mm512_addsets_round_ps_data.expect; + __mmask16 expect_sign = g_test_mm512_addsets_round_ps_data.sign; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __mmask16 sign; + __m512 res = _mm512_addsets_round_ps(ma, mb, &sign, rounding); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32) && (expect_sign == sign); +} + +int test_mm256_addsub_ps() +{ + float32_t *a = g_test_mm256_addsub_ps_data.a; + float32_t *b = g_test_mm256_addsub_ps_data.b; + float32_t *expect = g_test_mm256_addsub_ps_data.expect; + int iCount; + __m256 ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m256 res = _mm256_addsub_ps(ma, mb); + + return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32); +} +int test_mm256_addsub_pd() +{ + float64_t *a = g_test_mm256_addsub_pd_data.a; + float64_t *b = g_test_mm256_addsub_pd_data.b; + float64_t *expect = g_test_mm256_addsub_pd_data.expect; + int iCount; + __m256d ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m256d res = _mm256_addsub_pd(ma, mb); + + return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm256_sub_epi16() +{ + int16_t *a = g_test_mm256_sub_epi16_data.a; + int16_t *b = 
g_test_mm256_sub_epi16_data.b; + int16_t *expect = g_test_mm256_sub_epi16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m256i res = _mm256_sub_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_sub_epi32() +{ + int32_t *a = g_test_mm256_sub_epi32_data.a; + int32_t *b = g_test_mm256_sub_epi32_data.b; + int32_t *expect = g_test_mm256_sub_epi32_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m256i res = _mm256_sub_epi32(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_sub_epi64() +{ + int64_t *a = g_test_mm256_sub_epi64_data.a; + int64_t *b = g_test_mm256_sub_epi64_data.b; + int64_t *expect = g_test_mm256_sub_epi64_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2); + mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2); + } + __m256i res = _mm256_sub_epi64(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_sub_epi8() +{ + int8_t *a = g_test_mm256_sub_epi8_data.a; + int8_t *b = g_test_mm256_sub_epi8_data.b; + int8_t *expect = g_test_mm256_sub_epi8_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16); + mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16); + } + __m256i res = _mm256_sub_epi8(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_sub_pd() +{ + float64_t *a = g_test_mm256_sub_pd_data.a; + float64_t *b = g_test_mm256_sub_pd_data.b; + float64_t *expect = g_test_mm256_sub_pd_data.expect; + int iCount; + __m256d ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m256d res = _mm256_sub_pd(ma, mb); + + return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm256_sub_ps() +{ + float32_t *a = g_test_mm256_sub_ps_data.a; + float32_t *b = g_test_mm256_sub_ps_data.b; + float32_t *expect = g_test_mm256_sub_ps_data.expect; + int iCount; + __m256 ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m256 res = _mm256_sub_ps(ma, mb); + + return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm512_sub_epi16() +{ + int16_t *a = g_test_mm512_sub_epi16_data.a; + int16_t *b = g_test_mm512_sub_epi16_data.b; + int16_t *expect = g_test_mm512_sub_epi16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m512i res = _mm512_sub_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_sub_epi32() +{ + int32_t *a = g_test_mm512_sub_epi32_data.a; + int32_t *b = g_test_mm512_sub_epi32_data.b; + int32_t *expect = g_test_mm512_sub_epi32_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] 
= vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_sub_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_sub_epi64()
+{
+    int64_t *a = g_test_mm512_sub_epi64_data.a;
+    int64_t *b = g_test_mm512_sub_epi64_data.b;
+    int64_t *expect = g_test_mm512_sub_epi64_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+        mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2);
+    }
+    __m512i res = _mm512_sub_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_sub_epi8()
+{
+    int8_t *a = g_test_mm512_sub_epi8_data.a;
+    int8_t *b = g_test_mm512_sub_epi8_data.b;
+    int8_t *expect = g_test_mm512_sub_epi8_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m512i res = _mm512_sub_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_sub_pd()
+{
+    float64_t *a = g_test_mm512_sub_pd_data.a;
+    float64_t *b = g_test_mm512_sub_pd_data.b;
+    float64_t *expect = g_test_mm512_sub_pd_data.expect;
+    int iCount;
+    __m512d ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2);
+        mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2);
+    }
+    __m512d res = _mm512_sub_pd(ma, mb);
+
+    return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm512_bslli_epi128()
+{
+    int64_t *a = g_test_mm512_bslli_epi128_data.a;
+    int64_t *expect = g_test_mm512_bslli_epi128_data.expect;
+    int iCount;
+    __m512i ma;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    __m512i res = _mm512_bslli_epi128(ma, 8);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_bsrli_epi128()
+{
+    int64_t *a = g_test_mm512_bsrli_epi128_data.a;
+    int64_t *expect = g_test_mm512_bsrli_epi128_data.expect;
+    int iCount;
+    __m512i ma;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    __m512i res = _mm512_bsrli_epi128(ma, 4);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm_sll_epi64()
+{
+    int64_t a[2] = {1, -1};
+    int64_t b[2] = {2, 2};
+    int64_t expect[2] = {4, -4};
+    __m128i ma, mb;
+    ma.vect_s64 = vld1q_s64(a);
+    mb.vect_s64 = vld1q_s64(b);
+    __m128i res = _mm_sll_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m128i));
+}
+
+int test_mm256_sll_epi32()
+{
+    int32_t *a = g_test_mm256_sll_epi32_data.a;
+    int64_t *b = g_test_mm256_sll_epi32_data.b;
+    int32_t *expect = g_test_mm256_sll_epi32_data.expect;
+    int iCount;
+    __m256i ma;
+    __m128i mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+    }
+    mb.vect_s64 = vld1q_s64(b);
+    __m256i res = _mm256_sll_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+int test_mm256_sll_epi64()
+{
+    int64_t *a = g_test_mm256_sll_epi64_data.a;
+    int64_t *b = g_test_mm256_sll_epi64_data.b;
+    int64_t *expect = g_test_mm256_sll_epi64_data.expect;
+    int iCount;
+    __m256i ma;
+    __m128i mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    mb.vect_s64 = vld1q_s64(b);
+    __m256i res = _mm256_sll_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
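+/*
+ * Editor's sketch (hypothetical, not part of the original suite): the
+ * sll/srl tests in this group follow the x86 convention that the shift
+ * count is taken from the low 64 bits of the count operand, and that a
+ * count greater than or equal to the element width zeroes the lane. A
+ * scalar model of that behaviour, under those assumptions:
+ */
+#if 0 /* reference model only */
+static void sll_epi64_reference(const int64_t *a, uint64_t count, int64_t *out, int n)
+{
+    for (int i = 0; i < n; i++) {
+        /* logical shift of the bit pattern; counts >= 64 clear the lane */
+        out[i] = (count < 64) ? (int64_t)((uint64_t)a[i] << count) : 0;
+    }
+}
+#endif
+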
+int test_mm512_sll_epi64()
+{
+    int64_t *a = g_test_mm512_sll_epi64_data.a;
+    int64_t *b = g_test_mm512_sll_epi64_data.b;
+    int64_t *expect = g_test_mm512_sll_epi64_data.expect;
+    int iCount;
+    __m512i ma;
+    __m128i mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    mb.vect_s64 = vld1q_s64(b);
+    __m512i res = _mm512_sll_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm256_slli_epi32()
+{
+    int32_t *a = g_test_mm256_slli_epi32_data.a;
+    int b = g_test_mm256_slli_epi32_data.b;
+    int32_t *expect = g_test_mm256_slli_epi32_data.expect;
+    int iCount;
+    __m256i ma;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+    }
+    __m256i res = _mm256_slli_epi32(ma, b);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_slli_epi64()
+{
+    int64_t *a = g_test_mm256_slli_epi64_data.a;
+    int b = g_test_mm256_slli_epi64_data.b;
+    int64_t *expect = g_test_mm256_slli_epi64_data.expect;
+    int iCount;
+    __m256i ma;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    __m256i res = _mm256_slli_epi64(ma, b);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+int test_mm512_slli_epi64()
+{
+    int64_t *a = g_test_mm512_slli_epi64_data.a;
+    unsigned int b = g_test_mm512_slli_epi64_data.b;
+    int64_t *expect = g_test_mm512_slli_epi64_data.expect;
+    int iCount;
+    __m512i ma;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    __m512i res = _mm512_slli_epi64(ma, b);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm256_srli_epi64()
+{
+    int64_t *a = g_test_mm256_srli_epi64_data.a;
+    int b = g_test_mm256_srli_epi64_data.b;
+    uint64_t *expect = g_test_mm256_srli_epi64_data.expect;
+    int iCount;
+    __m256i ma;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    __m256i res = _mm256_srli_epi64(ma, b);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+int test_mm512_srli_epi64()
+{
+    int64_t *a = g_test_mm512_srli_epi64_data.a;
+    unsigned int b = g_test_mm512_srli_epi64_data.b;
+    uint64_t *expect = g_test_mm512_srli_epi64_data.expect;
+    int iCount;
+    __m512i ma;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    __m512i res = _mm512_srli_epi64(ma, b);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm256_slli_si256()
+{
+    int64_t *a = g_test_mm256_slli_si256_data.a;
+    uint64_t *expect = g_test_mm256_slli_si256_data.expect;
+    int iCount;
+    __m256i ma;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    __m256i res = _mm256_slli_si256(ma, 2);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_srli_si256()
+{
+    int64_t *a = g_test_mm256_srli_si256_data.a;
+    uint64_t *expect = g_test_mm256_srli_si256_data.expect;
+    int iCount;
+    __m256i ma;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    __m256i res = _mm256_srli_si256(ma, 2);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_blendv_ps()
+{
+    float32_t *arr_a = g_test_mm256_blendv_ps_data.a;
+    float32_t *arr_b =
g_test_mm256_blendv_ps_data.b; + float32_t *arr_m = g_test_mm256_blendv_ps_data.m; + float32_t *expect = g_test_mm256_blendv_ps_data.expect; + __m256 a, b, mask, res; + int iCount; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + a.vect_f32[iCount] = vld1q_f32(arr_a + iCount * 4); + b.vect_f32[iCount] = vld1q_f32(arr_b + iCount * 4); + mask.vect_f32[iCount] = vld1q_f32(arr_m + iCount * 4); + } + + res = _mm256_blendv_ps(a, b, mask); + + return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32); +} +int test_mm256_blendv_pd() +{ + float64_t *arr_a = g_test_mm256_blendv_pd_data.a; + float64_t *arr_b = g_test_mm256_blendv_pd_data.b; + float64_t *arr_m = g_test_mm256_blendv_pd_data.m; + float64_t *expect = g_test_mm256_blendv_pd_data.expect; + __m256d a, b, mask, res; + int iCount; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + a.vect_f64[iCount] = vld1q_f64(arr_a + iCount * 2); + b.vect_f64[iCount] = vld1q_f64(arr_b + iCount * 2); + mask.vect_f64[iCount] = vld1q_f64(arr_m + iCount * 2); + } + + res = _mm256_blendv_pd(a, b, mask); + + return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm256_blend_ps() +{ + float32_t *arr_a = g_test_mm256_blend_ps_data.a; + float32_t *arr_b = g_test_mm256_blend_ps_data.b; + int imm = g_test_mm256_blend_ps_data.imm; + float32_t *expect = g_test_mm256_blend_ps_data.expect; + __m256 a, b, res; + int iCount; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + a.vect_f32[iCount] = vld1q_f32(arr_a + iCount * 4); + b.vect_f32[iCount] = vld1q_f32(arr_b + iCount * 4); + } + + res = _mm256_blend_ps(a, b, imm); + + return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32); +} +int test_mm256_blend_pd() +{ + float64_t *arr_a = g_test_mm256_blend_pd_data.a; + float64_t *arr_b = g_test_mm256_blend_pd_data.b; + int imm = g_test_mm256_blend_pd_data.imm; + float64_t *expect = g_test_mm256_blend_pd_data.expect; + __m256d a, b, res; + int iCount; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + a.vect_f64[iCount] = vld1q_f64(arr_a + iCount * 2); + b.vect_f64[iCount] = vld1q_f64(arr_b + iCount * 2); + } + + res = _mm256_blend_pd(a, b, imm); + + return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64); +} +int test_mm512_mask_blend_ps() +{ + float32_t *arr_a = g_test_mm512_mask_blend_ps_data.a; + float32_t *arr_b = g_test_mm512_mask_blend_ps_data.b; + __mmask16 k = g_test_mm512_mask_blend_ps_data.k; + float32_t *expect = g_test_mm512_mask_blend_ps_data.expect; + __m512 a, b, res; + int iCount; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + a.vect_f32[iCount] = vld1q_f32(arr_a + iCount * 4); + b.vect_f32[iCount] = vld1q_f32(arr_b + iCount * 4); + } + + res = _mm512_mask_blend_ps(k, a, b); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} +int test_mm512_mask_blend_pd() +{ + float64_t *arr_a = g_test_mm512_mask_blend_pd_data.a; + float64_t *arr_b = g_test_mm512_mask_blend_pd_data.b; + __mmask8 k = g_test_mm512_mask_blend_pd_data.k; + float64_t *expect = g_test_mm512_mask_blend_pd_data.expect; + __m512d a, b, res; + int iCount; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + a.vect_f64[iCount] = vld1q_f64(arr_a + iCount * 2); + b.vect_f64[iCount] = vld1q_f64(arr_b + iCount * 2); + } + + res = _mm512_mask_blend_pd(k, a, b); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_sub_ps() +{ + float32_t *a = g_test_mm512_sub_ps_data.a; + float32_t *b = g_test_mm512_sub_ps_data.b; + float32_t *expect = 
g_test_mm512_sub_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_sub_ps(ma, mb); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm256_subs_epi16() +{ + int16_t *a = g_test_mm256_subs_epi16_data.a; + int16_t *b = g_test_mm256_subs_epi16_data.b; + int16_t *expect = g_test_mm256_subs_epi16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m256i res = _mm256_subs_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_subs_epi8() +{ + int8_t *a = g_test_mm256_subs_epi8_data.a; + int8_t *b = g_test_mm256_subs_epi8_data.b; + int8_t *expect = g_test_mm256_subs_epi8_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16); + mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16); + } + __m256i res = _mm256_subs_epi8(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm256_subs_epu16() +{ + uint16_t *a = g_test_mm256_subs_epu16_data.a; + uint16_t *b = g_test_mm256_subs_epu16_data.b; + uint16_t *expect = g_test_mm256_subs_epu16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u16[iCount] = vld1q_u16(a + iCount * 8); + mb.vect_u16[iCount] = vld1q_u16(b + iCount * 8); + } + __m256i res = _mm256_subs_epu16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_subs_epu8() +{ + uint8_t *a = g_test_mm256_subs_epu8_data.a; + uint8_t *b = g_test_mm256_subs_epu8_data.b; + uint8_t *expect = g_test_mm256_subs_epu8_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u8[iCount] = vld1q_u8(a + iCount * 16); + mb.vect_u8[iCount] = vld1q_u8(b + iCount * 16); + } + __m256i res = _mm256_subs_epu8(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm512_subs_epi16() +{ + int16_t *a = g_test_mm512_subs_epi16_data.a; + int16_t *b = g_test_mm512_subs_epi16_data.b; + int16_t *expect = g_test_mm512_subs_epi16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m512i res = _mm512_subs_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_subs_epi8() +{ + int8_t *a = g_test_mm512_subs_epi8_data.a; + int8_t *b = g_test_mm512_subs_epi8_data.b; + int8_t *expect = g_test_mm512_subs_epi8_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16); + mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16); + } + __m512i res = _mm512_subs_epi8(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_subs_epu16() +{ + uint16_t *a = g_test_mm512_subs_epu16_data.a; + uint16_t *b = g_test_mm512_subs_epu16_data.b; + uint16_t *expect = g_test_mm512_subs_epu16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_u16[iCount] = vld1q_u16(a + iCount * 
8); + mb.vect_u16[iCount] = vld1q_u16(b + iCount * 8); + } + __m512i res = _mm512_subs_epu16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_subs_epu8() +{ + uint8_t *a = g_test_mm512_subs_epu8_data.a; + uint8_t *b = g_test_mm512_subs_epu8_data.b; + uint8_t *expect = g_test_mm512_subs_epu8_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_u8[iCount] = vld1q_u8(a + iCount * 16); + mb.vect_u8[iCount] = vld1q_u8(b + iCount * 16); + } + __m512i res = _mm512_subs_epu8(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_sub_round_pd() +{ + float64_t *a = g_test_mm512_sub_round_pd_data.a; + float64_t *b = g_test_mm512_sub_round_pd_data.b; + float64_t *expect = g_test_mm512_sub_round_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_sub_round_pd(ma, mb, _MM_FROUND_NO_EXC); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_sub_round_ps() +{ + float32_t *a = g_test_mm512_sub_round_ps_data.a; + float32_t *b = g_test_mm512_sub_round_ps_data.b; + float32_t *expect = g_test_mm512_sub_round_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_sub_round_ps(ma, mb, _MM_FROUND_NO_EXC); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm512_subr_epi32() +{ + int32_t *a = g_test_mm512_subr_epi32_data.a; + int32_t *b = g_test_mm512_subr_epi32_data.b; + int32_t *expect = g_test_mm512_subr_epi32_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m512i res = _mm512_subr_epi32(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_subr_ps() +{ + float32_t *a = g_test_mm512_subr_ps_data.a; + float32_t *b = g_test_mm512_subr_ps_data.b; + float32_t *expect = g_test_mm512_subr_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_subr_ps(ma, mb); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm512_subr_pd() +{ + float64_t *a = g_test_mm512_subr_pd_data.a; + float64_t *b = g_test_mm512_subr_pd_data.b; + float64_t *expect = g_test_mm512_subr_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_subr_pd(ma, mb); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_subr_round_ps() +{ + float32_t *a = g_test_mm512_subr_round_ps_data.a; + float32_t *b = g_test_mm512_subr_round_ps_data.b; + float32_t *expect = g_test_mm512_subr_round_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } 
+ __m512 res = _mm512_subr_round_ps(ma, mb, _MM_FROUND_NO_EXC); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} +int test_mm512_subr_round_pd() +{ + float64_t *a = g_test_mm512_subr_round_pd_data.a; + float64_t *b = g_test_mm512_subr_round_pd_data.b; + float64_t *expect = g_test_mm512_subr_round_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_subr_round_pd(ma, mb, _MM_FROUND_NO_EXC); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_subsetb_epi32() +{ + int32_t *a = g_test_mm512_subsetb_epi32_data.a; + int32_t *b = g_test_mm512_subsetb_epi32_data.b; + int32_t *expect = g_test_mm512_subsetb_epi32_data.expect; + __mmask16 expect_borrow = g_test_mm512_subsetb_epi32_data.borrow; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __mmask16 borrow; + __m512i res = _mm512_subsetb_epi32(ma, mb, &borrow); + return comp_return(expect, &res, sizeof(__m512i)) && (expect_borrow == borrow); +} +int test_mm512_subrsetb_epi32() +{ + int32_t *a = g_test_mm512_subrsetb_epi32_data.a; + int32_t *b = g_test_mm512_subrsetb_epi32_data.b; + int32_t *expect = g_test_mm512_subrsetb_epi32_data.expect; + __mmask16 expect_borrow = g_test_mm512_subrsetb_epi32_data.borrow; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __mmask16 borrow; + __m512i res = _mm512_subrsetb_epi32(ma, mb, &borrow); + return comp_return(expect, &res, sizeof(__m512i)) && (expect_borrow == borrow); +} + +int test_mm256_zeroupper() +{ + return 1; +} + +int test_mm512_permutexvar_epi64() +{ + int64_t *a = g_test_mm512_permutexvar_epi64_data.a; + int64_t *b = g_test_mm512_permutexvar_epi64_data.b; + int64_t *expect = g_test_mm512_permutexvar_epi64_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2); + mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2); + } + __m512i res = _mm512_permutexvar_epi64(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_extracti32x4_epi32() +{ + int32_t *a = g_test_mm512_extracti32x4_epi32_data.a; + const int imm8 = g_test_mm512_extracti32x4_epi32_data.imm8; + int32_t *expect = g_test_mm512_extracti32x4_epi32_data.expect; + int iCount; + __m512i ma; + for (iCount = 0; iCount < 4; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + } + __m128i res = _mm512_extracti32x4_epi32(ma, imm8); + return comp_return(expect, &res, sizeof(__m128i)); +} + +int test_mm512_test_epi8_mask() +{ + int8_t *a = g_test_mm512_test_epi8_mask_data.a; + int8_t *b = g_test_mm512_test_epi8_mask_data.b; + __mmask64 expect = g_test_mm512_test_epi8_mask_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < 4; iCount++) { + ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16); + mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16); + } + __mmask64 res = _mm512_test_epi8_mask(ma, mb); + return (res == expect); +} + +int test_mm512_test_epi32_mask() +{ + int32_t *a = g_test_mm512_test_epi32_mask_data.a; + int32_t *b = g_test_mm512_test_epi32_mask_data.b; + 
__mmask16 expect = g_test_mm512_test_epi32_mask_data.expect;
+
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __mmask16 res = _mm512_test_epi32_mask(ma, mb);
+    return (res == expect);
+}
+
+int test_mm512_test_epi64_mask()
+{
+    int64_t *a = g_test_mm512_test_epi64_mask_data.a;
+    int64_t *b = g_test_mm512_test_epi64_mask_data.b;
+    __mmask8 expect = g_test_mm512_test_epi64_mask_data.expect;
+
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+        mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2);
+    }
+    __mmask8 res = _mm512_test_epi64_mask(ma, mb);
+    return (res == expect);
+}
+
+int test_mm256_mul_epi32()
+{
+    int32_t *a = g_test_mm256_mul_epi32_data.a;
+    int32_t *b = g_test_mm256_mul_epi32_data.b;
+    int32_t *expect = g_test_mm256_mul_epi32_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m256i res = _mm256_mul_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_mul_epu32()
+{
+    uint32_t *a = g_test_mm256_mul_epu32_data.a;
+    uint32_t *b = g_test_mm256_mul_epu32_data.b;
+    uint32_t *expect = g_test_mm256_mul_epu32_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_u32[iCount] = vld1q_u32(a + iCount * 4);
+        mb.vect_u32[iCount] = vld1q_u32(b + iCount * 4);
+    }
+    __m256i res = _mm256_mul_epu32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_mul_pd()
+{
+    float64_t *a = g_test_mm256_mul_pd_data.a;
+    float64_t *b = g_test_mm256_mul_pd_data.b;
+    float64_t *expect = g_test_mm256_mul_pd_data.expect;
+    int iCount;
+    __m256d ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2);
+        mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2);
+    }
+    __m256d res = _mm256_mul_pd(ma, mb);
+
+    return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm256_mul_ps()
+{
+    float32_t *a = g_test_mm256_mul_ps_data.a;
+    float32_t *b = g_test_mm256_mul_ps_data.b;
+    float32_t *expect = g_test_mm256_mul_ps_data.expect;
+    int iCount;
+    __m256 ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4);
+        mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4);
+    }
+    __m256 res = _mm256_mul_ps(ma, mb);
+
+    return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32);
+}
+
+int test_mm512_mul_epi32()
+{
+    int32_t *a = g_test_mm512_mul_epi32_data.a;
+    int32_t *b = g_test_mm512_mul_epi32_data.b;
+    int32_t *expect = g_test_mm512_mul_epi32_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_mul_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_mul_epu32()
+{
+    uint32_t *a = g_test_mm512_mul_epu32_data.a;
+    uint32_t *b = g_test_mm512_mul_epu32_data.b;
+    uint32_t *expect = g_test_mm512_mul_epu32_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_u32[iCount] = vld1q_u32(a + iCount * 4);
+        mb.vect_u32[iCount] = vld1q_u32(b + iCount * 4);
+    }
+    __m512i res = _mm512_mul_epu32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
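+
+/*
+ * Editor's sketch (hypothetical, not part of the original suite): per the
+ * Intel semantics, _mm512_mul_epi32 / _mm512_mul_epu32 multiply only the
+ * even-indexed 32-bit lanes and widen each product to 64 bits, so the
+ * result fills the whole 512-bit register and is compared above at
+ * sizeof(__m512i). A scalar model of the unsigned case:
+ */
+#if 0 /* reference model only */
+static void mul_epu32_reference(const uint32_t *a, const uint32_t *b, uint64_t *out, int n64)
+{
+    for (int i = 0; i < n64; i++) {
+        /* each 64-bit output lane comes from the even 32-bit input lane */
+        out[i] = (uint64_t)a[2 * i] * (uint64_t)b[2 * i];
+    }
+}
+#endif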
+ +int test_mm512_mul_pd() +{ + float64_t *a = g_test_mm512_mul_pd_data.a; + float64_t *b = g_test_mm512_mul_pd_data.b; + float64_t *expect = g_test_mm512_mul_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < 4; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_mul_pd(ma, mb); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_mul_ps() +{ + float32_t *a = g_test_mm512_mul_ps_data.a; + float32_t *b = g_test_mm512_mul_ps_data.b; + float32_t *expect = g_test_mm512_mul_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < 4; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_mul_ps(ma, mb); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm256_mulhi_epi16() +{ + int16_t *a = g_test_mm256_mulhi_epi16_data.a; + int16_t *b = g_test_mm256_mulhi_epi16_data.b; + int16_t *expect = g_test_mm256_mulhi_epi16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m256i res = _mm256_mulhi_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_mulhi_epu16() +{ + uint16_t *a = g_test_mm256_mulhi_epu16_data.a; + uint16_t *b = g_test_mm256_mulhi_epu16_data.b; + uint16_t *expect = g_test_mm256_mulhi_epu16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_u16[iCount] = vld1q_u16(a + iCount * 8); + mb.vect_u16[iCount] = vld1q_u16(b + iCount * 8); + } + __m256i res = _mm256_mulhi_epu16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm512_mulhi_epi16() +{ + int16_t *a = g_test_mm512_mulhi_epi16_data.a; + int16_t *b = g_test_mm512_mulhi_epi16_data.b; + int16_t *expect = g_test_mm512_mulhi_epi16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m512i res = _mm512_mulhi_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_mulhi_epu16() +{ + uint16_t *a = g_test_mm512_mulhi_epu16_data.a; + uint16_t *b = g_test_mm512_mulhi_epu16_data.b; + uint16_t *expect = g_test_mm512_mulhi_epu16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_u16[iCount] = vld1q_u16(a + iCount * 8); + mb.vect_u16[iCount] = vld1q_u16(b + iCount * 8); + } + __m512i res = _mm512_mulhi_epu16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_mulhi_epi32() +{ + int32_t *a = g_test_mm512_mulhi_epi32_data.a; + int32_t *b = g_test_mm512_mulhi_epi32_data.b; + int32_t *expect = g_test_mm512_mulhi_epi32_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m512i res = _mm512_mulhi_epi32(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_mulhi_epu32() +{ + uint32_t *a = g_test_mm512_mulhi_epu32_data.a; + uint32_t *b = g_test_mm512_mulhi_epu32_data.b; + uint32_t *expect = 
g_test_mm512_mulhi_epu32_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_u32[iCount] = vld1q_u32(a + iCount * 4); + mb.vect_u32[iCount] = vld1q_u32(b + iCount * 4); + } + __m512i res = _mm512_mulhi_epu32(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm256_mullo_epi16() +{ + int16_t *a = g_test_mm256_mullo_epi16_data.a; + int16_t *b = g_test_mm256_mullo_epi16_data.b; + int16_t *expect = g_test_mm256_mullo_epi16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m256i res = _mm256_mullo_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_mullo_epi32() +{ + int32_t *a = g_test_mm256_mullo_epi32_data.a; + int32_t *b = g_test_mm256_mullo_epi32_data.b; + int32_t *expect = g_test_mm256_mullo_epi32_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m256i res = _mm256_mullo_epi32(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm256_mullo_epi64() +{ + int64_t *a = g_test_mm256_mullo_epi64_data.a; + int64_t *b = g_test_mm256_mullo_epi64_data.b; + int64_t *expect = g_test_mm256_mullo_epi64_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2); + mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2); + } + __m256i res = _mm256_mullo_epi64(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm512_mullo_epi16() +{ + int16_t *a = g_test_mm512_mullo_epi16_data.a; + int16_t *b = g_test_mm512_mullo_epi16_data.b; + int16_t *expect = g_test_mm512_mullo_epi16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m512i res = _mm512_mullo_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_mullo_epi32() +{ + int32_t *a = g_test_mm512_mullo_epi32_data.a; + int32_t *b = g_test_mm512_mullo_epi32_data.b; + int32_t *expect = g_test_mm512_mullo_epi32_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m512i res = _mm512_mullo_epi32(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_mullo_epi64() +{ + int64_t *a = g_test_mm512_mullo_epi64_data.a; + int64_t *b = g_test_mm512_mullo_epi64_data.b; + int64_t *expect = g_test_mm512_mullo_epi64_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2); + mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2); + } + __m512i res = _mm512_mullo_epi64(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_mullox_epi64() +{ + int64_t *a = g_test_mm512_mullox_epi64_data.a; + int64_t *b = g_test_mm512_mullox_epi64_data.b; + int64_t *expect = g_test_mm512_mullox_epi64_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; 
iCount++) { + ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2); + mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2); + } + __m512i res = _mm512_mullox_epi64(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm256_mulhrs_epi16() +{ + int16_t *a = g_test_mm256_mulhrs_epi16_data.a; + int16_t *b = g_test_mm256_mulhrs_epi16_data.b; + int16_t *expect = g_test_mm256_mulhrs_epi16_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m256i res = _mm256_mulhrs_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} + +int test_mm512_mulhrs_epi16() +{ + int16_t *a = g_test_mm512_mulhrs_epi16_data.a; + int16_t *b = g_test_mm512_mulhrs_epi16_data.b; + int16_t *expect = g_test_mm512_mulhrs_epi16_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s16[iCount] = vld1q_s16(a + iCount * 8); + mb.vect_s16[iCount] = vld1q_s16(b + iCount * 8); + } + __m512i res = _mm512_mulhrs_epi16(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} + +int test_mm512_mul_round_pd() +{ + float64_t *a = g_test_mm512_mul_round_pd_data.a; + float64_t *b = g_test_mm512_mul_round_pd_data.b; + float64_t *expect = g_test_mm512_mul_round_pd_data.expect; + int iCount; + __m512d ma, mb; + for (iCount = 0; iCount < 4; iCount++) { + ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2); + mb.vect_f64[iCount] = vld1q_f64(b + iCount * 2); + } + __m512d res = _mm512_mul_round_pd(ma, mb, _MM_FROUND_NO_EXC); + + return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64); +} + +int test_mm512_mul_round_ps() +{ + float32_t *a = g_test_mm512_mul_round_ps_data.a; + float32_t *b = g_test_mm512_mul_round_ps_data.b; + float32_t *expect = g_test_mm512_mul_round_ps_data.expect; + int iCount; + __m512 ma, mb; + for (iCount = 0; iCount < 4; iCount++) { + ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4); + mb.vect_f32[iCount] = vld1q_f32(b + iCount * 4); + } + __m512 res = _mm512_mul_round_ps(ma, mb, _MM_FROUND_NO_EXC); + + return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32); +} + +int test_mm256_and_si256() +{ + int32_t *a = g_test_mm256_and_si256_data.a; + int32_t *b = g_test_mm256_and_si256_data.b; + int32_t *expect = g_test_mm256_and_si256_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m256i res = _mm256_and_si256(ma, mb); + return comp_return(expect, &res, sizeof(__m256i)); +} +int test_mm512_and_si512() +{ + int32_t *a = g_test_mm512_and_si512_data.a; + int32_t *b = g_test_mm512_and_si512_data.b; + int32_t *expect = g_test_mm512_and_si512_data.expect; + int iCount; + __m512i ma, mb; + for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4); + } + __m512i res = _mm512_and_si512(ma, mb); + return comp_return(expect, &res, sizeof(__m512i)); +} +int test_mm256_or_si256() +{ + int32_t *a = g_test_mm256_or_si256_data.a; + int32_t *b = g_test_mm256_or_si256_data.b; + int32_t *expect = g_test_mm256_or_si256_data.expect; + int iCount; + __m256i ma, mb; + for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) { + ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4); + mb.vect_s32[iCount] = vld1q_s32(b + iCount * 
+int test_mm256_and_si256()
+{
+    int32_t *a = g_test_mm256_and_si256_data.a;
+    int32_t *b = g_test_mm256_and_si256_data.b;
+    int32_t *expect = g_test_mm256_and_si256_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m256i res = _mm256_and_si256(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+int test_mm512_and_si512()
+{
+    int32_t *a = g_test_mm512_and_si512_data.a;
+    int32_t *b = g_test_mm512_and_si512_data.b;
+    int32_t *expect = g_test_mm512_and_si512_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_and_si512(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm256_or_si256()
+{
+    int32_t *a = g_test_mm256_or_si256_data.a;
+    int32_t *b = g_test_mm256_or_si256_data.b;
+    int32_t *expect = g_test_mm256_or_si256_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m256i res = _mm256_or_si256(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+int test_mm512_or_si512()
+{
+    int32_t *a = g_test_mm512_or_si512_data.a;
+    int32_t *b = g_test_mm512_or_si512_data.b;
+    int32_t *expect = g_test_mm512_or_si512_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_or_si512(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm256_andnot_si256()
+{
+    int32_t *a = g_test_mm256_andnot_si256_data.a;
+    int32_t *b = g_test_mm256_andnot_si256_data.b;
+    int32_t *expect = g_test_mm256_andnot_si256_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m256i res = _mm256_andnot_si256(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+int test_mm512_andnot_si512()
+{
+    int32_t *a = g_test_mm512_andnot_si512_data.a;
+    int32_t *b = g_test_mm512_andnot_si512_data.b;
+    int32_t *expect = g_test_mm512_andnot_si512_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_andnot_si512(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm256_xor_si256()
+{
+    int32_t *a = g_test_mm256_xor_si256_data.a;
+    int32_t *b = g_test_mm256_xor_si256_data.b;
+    int32_t *expect = g_test_mm256_xor_si256_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m256i res = _mm256_xor_si256(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+int test_mm512_xor_si512()
+{
+    int32_t *a = g_test_mm512_xor_si512_data.a;
+    int32_t *b = g_test_mm512_xor_si512_data.b;
+    int32_t *expect = g_test_mm512_xor_si512_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_xor_si512(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
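+/*
+ * Floating-point bitwise variants. NEON has no bitwise ops on float
+ * vectors, so these tests view the float lanes as integers, either with
+ * vreinterpretq_f32_u32 on load or through a vector/integer union.
+ */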
+int test_mm256_or_ps()
+{
+    uint32_t *a = g_test_mm256_or_ps_data.a;
+    uint32_t *b = g_test_mm256_or_ps_data.b;
+    uint32_t *expect = g_test_mm256_or_ps_data.expect;
+    int iCount;
+    __m256 ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_f32[iCount] = vreinterpretq_f32_u32(vld1q_u32(a + iCount * 4));
+        mb.vect_f32[iCount] = vreinterpretq_f32_u32(vld1q_u32(b + iCount * 4));
+    }
+    __m256 res = _mm256_or_ps(ma, mb);
+
+    return comp_return(expect, &res, sizeof(__m256));
+}
+
+int test_mm256_or_pd()
+{
+    uint64_t *a = g_test_mm256_or_pd_data.a;
+    uint64_t *b = g_test_mm256_or_pd_data.b;
+    uint64_t *expect = g_test_mm256_or_pd_data.expect;
+    int iCount;
+    union {
+        __m256d f;
+        __m256i i;
+    } ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.i.vect_u64[iCount] = vld1q_u64(a + iCount * 2);
+        mb.i.vect_u64[iCount] = vld1q_u64(b + iCount * 2);
+    }
+    __m256d res = _mm256_or_pd(ma.f, mb.f);
+
+    return comp_return(expect, &res, sizeof(__m256d));
+}
+
+int test_mm512_and_epi32()
+{
+    int32_t *a = g_test_mm512_and_epi32_data.a;
+    int32_t *b = g_test_mm512_and_epi32_data.b;
+    int32_t *expect = g_test_mm512_and_epi32_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_and_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_and_epi64()
+{
+    int64_t *a = g_test_mm512_and_epi64_data.a;
+    int64_t *b = g_test_mm512_and_epi64_data.b;
+    int64_t *expect = g_test_mm512_and_epi64_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+        mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2);
+    }
+    __m512i res = _mm512_and_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_or_epi32()
+{
+    int32_t *a = g_test_mm512_or_epi32_data.a;
+    int32_t *b = g_test_mm512_or_epi32_data.b;
+    int32_t *expect = g_test_mm512_or_epi32_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m512i res = _mm512_or_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_or_epi64()
+{
+    int64_t *a = g_test_mm512_or_epi64_data.a;
+    int64_t *b = g_test_mm512_or_epi64_data.b;
+    int64_t *expect = g_test_mm512_or_epi64_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+        mb.vect_s64[iCount] = vld1q_s64(b + iCount * 2);
+    }
+    __m512i res = _mm512_or_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_xor_ps()
+{
+    uint32_t *a = g_test_mm512_xor_ps_data.a;
+    uint32_t *b = g_test_mm512_xor_ps_data.b;
+    uint32_t *expect = g_test_mm512_xor_ps_data.expect;
+    int iCount;
+    __m512 ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_f32[iCount] = vreinterpretq_f32_u32(vld1q_u32(a + iCount * 4));
+        mb.vect_f32[iCount] = vreinterpretq_f32_u32(vld1q_u32(b + iCount * 4));
+    }
+    __m512 res = _mm512_xor_ps(ma, mb);
+
+    return comp_return(expect, &res, sizeof(__m512));
+}
+
+int test_mm512_xor_pd()
+{
+    uint64_t *a = g_test_mm512_xor_pd_data.a;
+    uint64_t *b = g_test_mm512_xor_pd_data.b;
+    uint64_t *expect = g_test_mm512_xor_pd_data.expect;
+    int iCount;
+    union {
+        __m512d f;
+        __m512i i;
+    } ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.i.vect_u64[iCount] = vld1q_u64(a + iCount * 2);
+        mb.i.vect_u64[iCount] = vld1q_u64(b + iCount * 2);
+    }
+    __m512d res = _mm512_xor_pd(ma.f, mb.f);
+
+    return comp_return(expect, &res, sizeof(__m512d));
+}
+
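+/*
+ * Comparison tests. The cmpeq_* intrinsics return a vector with each lane
+ * set to all-ones on equality and all-zeros otherwise; the AVX-512-style
+ * cmp_*_mask intrinsics instead return one bit per lane in a __mmaskN.
+ */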
+int test_mm256_cmpeq_epi8()
+{
+    int8_t *a = g_test_mm256_cmpeq_epi8_data.a;
+    int8_t *b = g_test_mm256_cmpeq_epi8_data.b;
+    uint8_t *expect = g_test_mm256_cmpeq_epi8_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m256i res = _mm256_cmpeq_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_cmpeq_epi32()
+{
+    int32_t *a = g_test_mm256_cmpeq_epi32_data.a;
+    int32_t *b = g_test_mm256_cmpeq_epi32_data.b;
+    uint32_t *expect = g_test_mm256_cmpeq_epi32_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __m256i res = _mm256_cmpeq_epi32(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm_cmpeq_epi64()
+{
+    int64_t *a = g_test_mm_cmpeq_epi64_data.a;
+    int64_t *b = g_test_mm_cmpeq_epi64_data.b;
+    uint64_t *expect = g_test_mm_cmpeq_epi64_data.expect;
+    __m128i ma, mb;
+    ma.vect_s64 = vld1q_s64(a);
+    mb.vect_s64 = vld1q_s64(b);
+    __m128i res = _mm_cmpeq_epi64(ma, mb);
+    return comp_return(expect, &res, sizeof(__m128i));
+}
+
+int test_mm512_cmp_epi32_mask()
+{
+    int32_t *a = g_test_mm512_cmp_epi32_mask_data.a;
+    int32_t *b = g_test_mm512_cmp_epi32_mask_data.b;
+    __mmask16 *expect = g_test_mm512_cmp_epi32_mask_data.expect;
+    __m512i ma, mb;
+    int iCount;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+        mb.vect_s32[iCount] = vld1q_s32(b + iCount * 4);
+    }
+    __mmask16 res[8];
+    res[0] = _mm512_cmp_epi32_mask(ma, mb, _MM_CMPINT_EQ);
+    res[1] = _mm512_cmp_epi32_mask(ma, mb, _MM_CMPINT_LT);
+    res[2] = _mm512_cmp_epi32_mask(ma, mb, _MM_CMPINT_LE);
+    res[3] = _mm512_cmp_epi32_mask(ma, mb, _MM_CMPINT_FALSE);
+    res[4] = _mm512_cmp_epi32_mask(ma, mb, _MM_CMPINT_NE);
+    res[5] = _mm512_cmp_epi32_mask(ma, mb, _MM_CMPINT_NLT);
+    res[6] = _mm512_cmp_epi32_mask(ma, mb, _MM_CMPINT_NLE);
+    res[7] = _mm512_cmp_epi32_mask(ma, mb, _MM_CMPINT_TRUE);
+    return comp_return(expect, res, 8 * sizeof(__mmask16));
+}
+
+int test_mm512_cmp_epi8_mask()
+{
+    int8_t *a = g_test_mm512_cmp_epi8_mask_data.a;
+    int8_t *b = g_test_mm512_cmp_epi8_mask_data.b;
+    __mmask64 *expect = g_test_mm512_cmp_epi8_mask_data.expect;
+    __m512i ma, mb;
+    int iCount;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __mmask64 res[8];
+    res[0] = _mm512_cmp_epi8_mask(ma, mb, _MM_CMPINT_EQ);
+    res[1] = _mm512_cmp_epi8_mask(ma, mb, _MM_CMPINT_LT);
+    res[2] = _mm512_cmp_epi8_mask(ma, mb, _MM_CMPINT_LE);
+    res[3] = _mm512_cmp_epi8_mask(ma, mb, _MM_CMPINT_FALSE);
+    res[4] = _mm512_cmp_epi8_mask(ma, mb, _MM_CMPINT_NE);
+    res[5] = _mm512_cmp_epi8_mask(ma, mb, _MM_CMPINT_NLT);
+    res[6] = _mm512_cmp_epi8_mask(ma, mb, _MM_CMPINT_NLE);
+    res[7] = _mm512_cmp_epi8_mask(ma, mb, _MM_CMPINT_TRUE);
+    return comp_return(expect, res, 8 * sizeof(__mmask64));
+}
+
+int test_mm512_cmpeq_epi8_mask()
+{
+    int8_t *a = g_test_mm512_cmpeq_epi8_mask_data.a;
+    int8_t *b = g_test_mm512_cmpeq_epi8_mask_data.b;
+    __mmask64 expect = g_test_mm512_cmpeq_epi8_mask_data.expect;
+    __m512i ma, mb;
+    int iCount;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __mmask64 res;
+    res = _mm512_cmpeq_epi8_mask(ma, mb);
+    return res == expect;
+}
+
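+/*
+ * Zero-masked variant: k1 gates the comparison, i.e. the result should
+ * equal k1 & _mm512_cmpeq_epi8_mask(a, b), with bits outside k1 cleared.
+ */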
+int test_mm512_mask_cmpeq_epi8_mask()
+{
+    int8_t *a = g_test_mm512_mask_cmpeq_epi8_mask_data.a;
+    int8_t *b = g_test_mm512_mask_cmpeq_epi8_mask_data.b;
+    __mmask64 k1 = g_test_mm512_mask_cmpeq_epi8_mask_data.k1;
+    __mmask64 expect = g_test_mm512_mask_cmpeq_epi8_mask_data.expect;
+    __m512i ma, mb;
+    int iCount;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __mmask64 res;
+    res = _mm512_mask_cmpeq_epi8_mask(k1, ma, mb);
+    return res == expect;
+}
+
+int test_mm512_set_epi32()
+{
+    int32_t *a = g_test_mm512_set_epi32_data.a;
+    int32_t *expect = g_test_mm512_set_epi32_data.expect;
+    __m512i res = _mm512_set_epi32(a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8], a[7], a[6], a[5], a[4], a[3],
+        a[2], a[1], a[0]);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_set_epi64()
+{
+    int64_t *a = g_test_mm512_set_epi64_data.a;
+    int64_t *expect = g_test_mm512_set_epi64_data.expect;
+    __m512i res = _mm512_set_epi64(a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_set1_epi32()
+{
+    int32_t a = g_test_mm512_set1_epi32_data.a;
+    int32_t *expect = g_test_mm512_set1_epi32_data.expect;
+    __m512i res = _mm512_set1_epi32(a);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_set1_epi64()
+{
+    int64_t a = g_test_mm512_set1_epi64_data.a;
+    int64_t *expect = g_test_mm512_set1_epi64_data.expect;
+    __m512i res = _mm512_set1_epi64(a);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_set1_epi8()
+{
+    int8_t a = g_test_mm512_set1_epi8_data.a;
+    int8_t *expect = g_test_mm512_set1_epi8_data.expect;
+    __m512i res = _mm512_set1_epi8((char)a);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+int test_mm512_set_ps()
+{
+    float32_t *a = g_test_mm512_set_ps_data.a;
+    float32_t *expect = g_test_mm512_set_ps_data.expect;
+    __m512 res = _mm512_set_ps(a[15], a[14], a[13], a[12], a[11], a[10], a[9], a[8], a[7], a[6], a[5], a[4], a[3], a[2],
+        a[1], a[0]);
+
+    return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32);
+}
+int test_mm512_set_pd()
+{
+    float64_t *a = g_test_mm512_set_pd_data.a;
+    float64_t *expect = g_test_mm512_set_pd_data.expect;
+    __m512d res = _mm512_set_pd(a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]);
+
+    return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64);
+}
+int test_mm512_set1_ps()
+{
+    float32_t a = g_test_mm512_set1_ps_data.a;
+    float32_t *expect = g_test_mm512_set1_ps_data.expect;
+    __m512 res = _mm512_set1_ps(a);
+
+    return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32);
+}
+int test_mm512_set1_pd()
+{
+    float64_t a = g_test_mm512_set1_pd_data.a;
+    float64_t *expect = g_test_mm512_set1_pd_data.expect;
+    __m512d res = _mm512_set1_pd(a);
+
+    return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64);
+}
+int test_mm512_setzero_ps()
+{
+    float32_t *expect = g_test_mm512_setzero_ps_data.expect;
+    __m512 res = _mm512_setzero_ps();
+
+    return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32);
+}
+int test_mm512_setzero_pd()
+{
+    float64_t *expect = g_test_mm512_setzero_pd_data.expect;
+    __m512d res = _mm512_setzero_pd();
+
+    return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm_move_sd()
+{
+    float64_t *a = g_test_mm_move_sd_data.a;
+    float64_t *b = g_test_mm_move_sd_data.b;
+    float64_t *expect = g_test_mm_move_sd_data.expect;
+    __m128d ma, mb, res;
+    ma = vld1q_f64(a);
+    mb = vld1q_f64(b);
+    res = _mm_move_sd(ma, mb);
+
+    return IsEqualFloat64x2(res, expect, DEFAULT_EPSILON_F64);
+}
+int test_mm_move_ss()
+{
+    float32_t *a = g_test_mm_move_ss_data.a;
+    float32_t *b = g_test_mm_move_ss_data.b;
+    float32_t *expect = g_test_mm_move_ss_data.expect;
+    __m128 ma, mb, res;
+    ma = vld1q_f32(a);
+    mb = vld1q_f32(b);
+    res = _mm_move_ss(ma, mb);
+
+    return IsEqualFloat32x4(res, expect, DEFAULT_EPSILON_F32);
+}
+
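+/*
+ * movemask collects the sign (most significant) bit of every lane into an
+ * ordinary int: 32 bits for epi8 on __m256i, 8 bits for ps on __m256.
+ */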
+int test_mm256_movemask_epi8()
+{
+    int8_t *a = g_test_mm256_movemask_epi8_data.a;
+    int expect = g_test_mm256_movemask_epi8_data.expect;
+    int iCount;
+    __m256i ma;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+    }
+    int res = _mm256_movemask_epi8(ma);
+
+    return res == expect;
+}
+int test_mm256_movemask_ps()
+{
+    float32_t *a = g_test_mm256_movemask_ps_data.a;
+    int expect = g_test_mm256_movemask_ps_data.expect;
+    int iCount;
+    __m256 ma;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4);
+    }
+    int res = _mm256_movemask_ps(ma);
+
+    return res == expect;
+}
+
+int test_mm_testz_si128()
+{
+    int8_t *a = g_test_mm_testz_si128_data.a;
+    int8_t *b = g_test_mm_testz_si128_data.b;
+    int expect = g_test_mm_testz_si128_data.expect;
+    __m128i ma, mb;
+    ma.vect_s8 = vld1q_s8(a);
+    mb.vect_s8 = vld1q_s8(b);
+    int res = _mm_testz_si128(ma, mb);
+
+    return res == expect;
+}
+int test_mm256_testz_si256()
+{
+    int8_t *a = g_test_mm256_testz_si256_data.a;
+    int8_t *b = g_test_mm256_testz_si256_data.b;
+    int expect = g_test_mm256_testz_si256_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    int res = _mm256_testz_si256(ma, mb);
+    return res == expect;
+}
+int test_mm512_movm_epi8()
+{
+    __mmask64 a = g_test_mm512_movm_epi8_data.a;
+    int8_t *expect = g_test_mm512_movm_epi8_data.expect;
+    __m512i res = _mm512_movm_epi8(a);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
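+/*
+ * Extract tests. On x86 the extract index must be an immediate; here it
+ * comes from the test data as a const int, so these tests rely on the
+ * AVX2NEON port accepting a runtime index rather than an immediate-only
+ * macro.
+ */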
+int test_mm_extract_epi32()
+{
+    int32_t *a = g_test_mm_extract_epi32_data.a;
+    const int b = g_test_mm_extract_epi32_data.b;
+    int32_t expect = g_test_mm_extract_epi32_data.expect;
+    __m128i ma;
+    ma.vect_s32 = vld1q_s32(a);
+
+    int res = _mm_extract_epi32(ma, b);
+    return expect == res;
+}
+int test_mm_extract_epi64()
+{
+    int64_t *a = g_test_mm_extract_epi64_data.a;
+    const int b = g_test_mm_extract_epi64_data.b;
+    int64_t expect = g_test_mm_extract_epi64_data.expect;
+    __m128i ma;
+    ma.vect_s64 = vld1q_s64(a);
+
+    int64_t res = _mm_extract_epi64(ma, b);
+    return expect == res;
+}
+int test_mm256_extracti128_si256()
+{
+    int64_t *a = g_test_mm256_extracti128_si256_data.a;
+    const int b = g_test_mm256_extracti128_si256_data.b;
+    int64_t *expect = g_test_mm256_extracti128_si256_data.expect;
+
+    int iCount;
+    __m256i ma;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+
+    __m128i res = _mm256_extracti128_si256(ma, b);
+    return comp_return(expect, &res, sizeof(__m128i));
+}
+
+int test_mm_extract_ps()
+{
+    float32_t *a = g_test_mm_extract_ps_data.a;
+    const int b = g_test_mm_extract_ps_data.b;
+    int expect = g_test_mm_extract_ps_data.expect;
+    __m128 ma;
+    ma = vld1q_f32(a);
+    int res = _mm_extract_ps(ma, b);
+
+    return expect == res;
+}
+
+int test_mm256_extract_epi32()
+{
+    int32_t *a = g_test_mm256_extract_epi32_data.a;
+    const int b = g_test_mm256_extract_epi32_data.b;
+    int32_t expect = g_test_mm256_extract_epi32_data.expect;
+    __m256i ma;
+    int iCount;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; ++iCount) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+    }
+    int32_t res = _mm256_extract_epi32(ma, b);
+
+    return res == expect;
+}
+int test_mm256_extract_epi64()
+{
+    int64_t *a = g_test_mm256_extract_epi64_data.a;
+    const int b = g_test_mm256_extract_epi64_data.b;
+    int64_t expect = g_test_mm256_extract_epi64_data.expect;
+    __m256i ma;
+    int iCount;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; ++iCount) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    int64_t res = _mm256_extract_epi64(ma, b);
+
+    return res == expect;
+}
+int test_mm256_extractf128_ps()
+{
+    float32_t *a = g_test_mm256_extractf128_ps_data.a;
+    const int b = g_test_mm256_extractf128_ps_data.b;
+    float32_t *expect = g_test_mm256_extractf128_ps_data.expect;
+    __m256 ma;
+    int iCount;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; ++iCount) {
+        ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4);
+    }
+    __m128 res = _mm256_extractf128_ps(ma, b);
+
+    return IsEqualFloat32x4(res, expect, DEFAULT_EPSILON_F32);
+}
+int test_mm256_extractf128_pd()
+{
+    float64_t *a = g_test_mm256_extractf128_pd_data.a;
+    const int b = g_test_mm256_extractf128_pd_data.b;
+    float64_t *expect = g_test_mm256_extractf128_pd_data.expect;
+    __m256d ma;
+    int iCount;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; ++iCount) {
+        ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2);
+    }
+    __m128d res = _mm256_extractf128_pd(ma, b);
+
+    return IsEqualFloat64x2(res, expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm512_extractf32x8_ps()
+{
+    float32_t *a = g_test_mm512_extractf32x8_ps_data.a;
+    const int b = g_test_mm512_extractf32x8_ps_data.b;
+    float32_t *expect = g_test_mm512_extractf32x8_ps_data.expect;
+    __m512 ma;
+    int iCount;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; ++iCount) {
+        ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4);
+    }
+    __m256 res = _mm512_extractf32x8_ps(ma, b);
+
+    return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32);
+}
+
+int test_mm512_extractf64x4_pd()
+{
+    float64_t *a = g_test_mm512_extractf64x4_pd_data.a;
+    const int b = g_test_mm512_extractf64x4_pd_data.b;
+    float64_t *expect = g_test_mm512_extractf64x4_pd_data.expect;
+    __m512d ma;
+    int iCount;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; ++iCount) {
+        ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2);
+    }
+    __m256d res = _mm512_extractf64x4_pd(ma, b);
+
+    return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64);
+}
+
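+/*
+ * CRC32 tests: these are the SSE4.2 CRC32C (Castagnoli, polynomial
+ * 0x1EDC6F41) accumulators, which map naturally onto the AArch64
+ * crc32cb/crc32ch/crc32cw/crc32cx instructions where available.
+ */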
+int test_mm_crc32_u8()
+{
+    unsigned int crc = g_test_mm_crc32_u8_data.crc;
+    unsigned char v = g_test_mm_crc32_u8_data.v;
+    unsigned int expect = g_test_mm_crc32_u8_data.expect;
+    unsigned int res = _mm_crc32_u8(crc, v);
+    return res == expect;
+}
+int test_mm_crc32_u16()
+{
+    unsigned int crc = g_test_mm_crc32_u16_data.crc;
+    unsigned short v = g_test_mm_crc32_u16_data.v;
+    unsigned int expect = g_test_mm_crc32_u16_data.expect;
+    unsigned int res = _mm_crc32_u16(crc, v);
+    return res == expect;
+}
+int test_mm_crc32_u32()
+{
+    unsigned int crc = g_test_mm_crc32_u32_data.crc;
+    unsigned int v = g_test_mm_crc32_u32_data.v;
+    unsigned int expect = g_test_mm_crc32_u32_data.expect;
+    unsigned int res = _mm_crc32_u32(crc, v);
+    return res == expect;
+}
+int test_mm_crc32_u64()
+{
+    unsigned __int64 crc = g_test_mm_crc32_u64_data.crc;
+    unsigned __int64 v = g_test_mm_crc32_u64_data.v;
+    unsigned __int64 expect = g_test_mm_crc32_u64_data.expect;
+    unsigned __int64 res = _mm_crc32_u64(crc, v);
+    return res == expect;
+}
+
+int test_mm256_shuffle_epi8()
+{
+    int8_t *a = g_test_mm256_shuffle_epi8_data.a;
+    int8_t *b = g_test_mm256_shuffle_epi8_data.b;
+    int8_t *expect = g_test_mm256_shuffle_epi8_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m256i res = _mm256_shuffle_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm512_shuffle_epi8()
+{
+    int8_t *a = g_test_mm512_shuffle_epi8_data.a;
+    int8_t *b = g_test_mm512_shuffle_epi8_data.b;
+    int8_t *expect = g_test_mm512_shuffle_epi8_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m512i res = _mm512_shuffle_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_maskz_shuffle_epi8()
+{
+    int8_t *a = g_test_mm512_maskz_shuffle_epi8_data.a;
+    int8_t *b = g_test_mm512_maskz_shuffle_epi8_data.b;
+    __mmask64 k = g_test_mm512_maskz_shuffle_epi8_data.k;
+    int8_t *expect = g_test_mm512_maskz_shuffle_epi8_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m512i res = _mm512_maskz_shuffle_epi8(k, ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm256_multishift_epi64_epi8()
+{
+    uint8_t *a = g_test_mm256_multishift_epi64_epi8_data.a;
+    uint8_t *b = g_test_mm256_multishift_epi64_epi8_data.b;
+    uint8_t *expect = g_test_mm256_multishift_epi64_epi8_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_u8[iCount] = vld1q_u8(a + iCount * 16);
+        mb.vect_u8[iCount] = vld1q_u8(b + iCount * 16);
+    }
+    __m256i res = _mm256_multishift_epi64_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm512_multishift_epi64_epi8()
+{
+    uint8_t *a = g_test_mm512_multishift_epi64_epi8_data.a;
+    uint8_t *b = g_test_mm512_multishift_epi64_epi8_data.b;
+    uint8_t *expect = g_test_mm512_multishift_epi64_epi8_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_u8[iCount] = vld1q_u8(a + iCount * 16);
+        mb.vect_u8[iCount] = vld1q_u8(b + iCount * 16);
+    }
+    __m512i res = _mm512_multishift_epi64_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
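+/*
+ * unpacklo/unpackhi interleave bytes from the low (resp. high) half of
+ * each 128-bit lane of the two sources; like the x86 originals they do
+ * not interleave across 128-bit lane boundaries.
+ */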
+int test_mm256_unpacklo_epi8()
+{
+    int8_t *a = g_test_mm256_unpacklo_epi8_data.a;
+    int8_t *b = g_test_mm256_unpacklo_epi8_data.b;
+    int8_t *expect = g_test_mm256_unpacklo_epi8_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m256i res = _mm256_unpacklo_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_unpackhi_epi8()
+{
+    int8_t *a = g_test_mm256_unpackhi_epi8_data.a;
+    int8_t *b = g_test_mm256_unpackhi_epi8_data.b;
+    int8_t *expect = g_test_mm256_unpackhi_epi8_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m256i res = _mm256_unpackhi_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm512_unpacklo_epi8()
+{
+    int8_t *a = g_test_mm512_unpacklo_epi8_data.a;
+    int8_t *b = g_test_mm512_unpacklo_epi8_data.b;
+    int8_t *expect = g_test_mm512_unpacklo_epi8_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m512i res = _mm512_unpacklo_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_unpackhi_epi8()
+{
+    int8_t *a = g_test_mm512_unpackhi_epi8_data.a;
+    int8_t *b = g_test_mm512_unpackhi_epi8_data.b;
+    int8_t *expect = g_test_mm512_unpackhi_epi8_data.expect;
+    int iCount;
+    __m512i ma, mb;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m512i res = _mm512_unpackhi_epi8(ma, mb);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
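+/*
+ * set/set1/setzero tests. The _mm*_set_* argument order runs from the
+ * highest element down to element 0, while comp_return() compares memory
+ * order, so the expect arrays are laid out lowest element first.
+ */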
+int test_mm_set_pd()
+{
+    float64_t *src = g_test_mm_set_pd_data.a;
+    __m128d dst = _mm_set_pd(src[1], src[0]);
+
+    return comp_return(g_test_mm_set_pd_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set_epi32()
+{
+    int32_t *src = g_test_mm256_set_epi32_data.a;
+    __m256i dst = _mm256_set_epi32(src[7], src[6], src[5], src[4], src[3], src[2], src[1], src[0]);
+
+    return comp_return(g_test_mm256_set_epi32_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set_epi64x()
+{
+    int64_t *src = g_test_mm256_set_epi64x_data.a;
+    __m256i dst = _mm256_set_epi64x(src[3], src[2], src[1], src[0]);
+
+    return comp_return(g_test_mm256_set_epi64x_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set_m128i()
+{
+    int32_t *src = g_test_mm256_set_m128i_data.a;
+    __m128i low = _mm_set_epi32(src[3], src[2], src[1], src[0]);
+    __m128i high = _mm_set_epi32(src[7], src[6], src[5], src[4]);
+    __m256i dst = _mm256_set_m128i(high, low);
+
+    return comp_return(g_test_mm256_set_m128i_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set_ps()
+{
+    float32_t *src = g_test_mm256_set_ps_data.a;
+    __m256 dst = _mm256_set_ps(src[7], src[6], src[5], src[4], src[3], src[2], src[1], src[0]);
+
+    return comp_return(g_test_mm256_set_ps_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set_pd()
+{
+    float64_t *src = g_test_mm256_set_pd_data.a;
+    __m256d dst = _mm256_set_pd(src[3], src[2], src[1], src[0]);
+
+    return comp_return(g_test_mm256_set_pd_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_setzero_si256()
+{
+    __m256i dst = _mm256_setzero_si256();
+
+    return comp_return(g_test_mm256_setzero_si256_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_setzero_ps()
+{
+    __m256 dst = _mm256_setzero_ps();
+
+    return comp_return(g_test_mm256_setzero_ps_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_setzero_pd()
+{
+    __m256d dst = _mm256_setzero_pd();
+
+    return comp_return(g_test_mm256_setzero_pd_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm_set1_epi64x()
+{
+    __m128i dst = _mm_set1_epi64x(g_test_mm_set1_epi64x_data.a);
+
+    return comp_return(g_test_mm_set1_epi64x_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm_set1_pd()
+{
+    __m128d dst = _mm_set1_pd(g_test_mm_set1_pd_data.a);
+
+    return comp_return(g_test_mm_set1_pd_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set1_epi8()
+{
+    __m256i dst = _mm256_set1_epi8(g_test_mm256_set1_epi8_data.a);
+
+    return comp_return(g_test_mm256_set1_epi8_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set1_epi32()
+{
+    __m256i dst = _mm256_set1_epi32(g_test_mm256_set1_epi32_data.a);
+
+    return comp_return(g_test_mm256_set1_epi32_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set1_epi64x()
+{
+    __m256i dst = _mm256_set1_epi64x(g_test_mm256_set1_epi64x_data.a);
+
+    return comp_return(g_test_mm256_set1_epi64x_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set1_pd()
+{
+    __m256d dst = _mm256_set1_pd(g_test_mm256_set1_pd_data.a);
+
+    return comp_return(g_test_mm256_set1_pd_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_set1_ps()
+{
+    __m256 dst = _mm256_set1_ps(g_test_mm256_set1_ps_data.a);
+
+    return comp_return(g_test_mm256_set1_ps_data.expect, &dst, sizeof(dst));
+}
+
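+/*
+ * Load tests. On AArch64 the aligned and unaligned forms can be backed by
+ * the same NEON loads, so load/loadu share the same expectations; the
+ * mask/maskz loads keep only elements whose mask bit (for epi8) or mask
+ * MSB (for maskload_epi32) is set and zero or preserve the rest.
+ */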
+int test_mm256_load_si256()
+{
+    int32_t *src = g_test_mm256_load_si256_data.a;
+    __m256i data = _mm256_set_epi32(src[7], src[6], src[5], src[4], src[3], src[2], src[1], src[0]);
+    __m256i dst = _mm256_load_si256(&data);
+
+    return comp_return(g_test_mm256_load_si256_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_loadu_si256()
+{
+    int32_t *src = g_test_mm256_loadu_si256_data.a;
+    __m256i data = _mm256_set_epi32(src[7], src[6], src[5], src[4], src[3], src[2], src[1], src[0]);
+    __m256i dst = _mm256_loadu_si256(&data);
+
+    return comp_return(g_test_mm256_loadu_si256_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_maskload_epi32()
+{
+    int32_t *src = g_test_mm256_maskload_epi32_data.a;
+    int32_t *maskSrc = g_test_mm256_maskload_epi32_data.mask;
+
+    __m256i mask = _mm256_set_epi32(maskSrc[7], maskSrc[6], maskSrc[5], maskSrc[4], maskSrc[3], maskSrc[2], maskSrc[1],
+        maskSrc[0]);
+    __m256i dst = _mm256_maskload_epi32(src, mask);
+
+    return comp_return(g_test_mm256_maskload_epi32_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm512_load_si512()
+{
+    int32_t *src = g_test_mm512_load_si512_data.a;
+    __m512i dst;
+
+    dst = _mm512_load_si512(src);
+
+    return comp_return(g_test_mm512_load_si512_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm512_loadu_si512()
+{
+    __m512i dst = _mm512_loadu_si512(g_test_mm512_loadu_si512_data.a);
+
+    return comp_return(g_test_mm512_loadu_si512_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm512_mask_loadu_epi8()
+{
+    __m512i src = _mm512_set1_epi8(g_test_mm512_mask_loadu_epi8_data.src);
+    int8_t *mem_addr = g_test_mm512_mask_loadu_epi8_data.mem_addr;
+    unsigned long long mask = g_test_mm512_mask_loadu_epi8_data.mask;
+    __m512i dst = _mm512_mask_loadu_epi8(src, mask, mem_addr);
+
+    return comp_return(g_test_mm512_mask_loadu_epi8_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm512_maskz_loadu_epi8()
+{
+    int8_t *mem_addr = g_test_mm512_maskz_loadu_epi8_data.mem_addr;
+    unsigned long long k = g_test_mm512_maskz_loadu_epi8_data.k;
+    __m512i dst = _mm512_maskz_loadu_epi8(k, mem_addr);
+
+    return comp_return(g_test_mm512_maskz_loadu_epi8_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm512_abs_epi8()
+{
+    __m512i a;
+    __m512i dst;
+
+    a = _mm512_loadu_si512(g_test_mm512_abs_epi8_data.a);
+    dst = _mm512_abs_epi8(a);
+
+    return comp_return(g_test_mm512_abs_epi8_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_broadcastq_epi64()
+{
+    int64_t *src = g_test_mm256_broadcastq_epi64_data.a;
+    __m128i a = _mm_set_epi64x(src[1], src[0]);
+    __m256i dst = _mm256_broadcastq_epi64(a);
+
+    return comp_return(g_test_mm256_broadcastq_epi64_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_broadcastsi128_si256()
+{
+    int64_t *src = g_test_mm256_broadcastsi128_si256_data.a;
+
+    __m128i a = _mm_set_epi64x(src[1], src[0]);
+    __m256i dst = _mm256_broadcastsi128_si256(a);
+
+    return comp_return(g_test_mm256_broadcastsi128_si256_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm512_broadcast_i32x4()
+{
+    int32_t *src = g_test_mm512_broadcast_i32x4_data.a;
+
+    __m128i a = _mm_set_epi32(src[3], src[2], src[1], src[0]);
+    __m512i dst = _mm512_broadcast_i32x4(a);
+
+    return comp_return(g_test_mm512_broadcast_i32x4_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm512_broadcast_i64x4()
+{
+    int64_t *src = g_test_mm512_broadcast_i64x4_data.a;
+
+    __m256i a = _mm256_set_epi64x(src[3], src[2], src[1], src[0]);
+    __m512i dst = _mm512_broadcast_i64x4(a);
+
+    return comp_return(g_test_mm512_broadcast_i64x4_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm512_mask_broadcast_i64x4()
+{
+    __mmask8 k = g_test_mm512_mask_broadcast_i64x4_data.k;
+    __m512i src = _mm512_loadu_si512(g_test_mm512_mask_broadcast_i64x4_data.src);
+    int64_t *addr = g_test_mm512_mask_broadcast_i64x4_data.a;
+    __m256i a = _mm256_set_epi64x(addr[3], addr[2], addr[1], addr[0]);
+    __m512i dst = _mm512_mask_broadcast_i64x4(src, k, a);
+
+    return comp_return(g_test_mm512_mask_broadcast_i64x4_data.expect, &dst, sizeof(dst));
+}
+
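+/*
+ * cast vs. cvt: the _mm*_cast* intrinsics only reinterpret the bit
+ * pattern (widening casts leave the upper part undefined, so only the low
+ * 128 bits are checked), whereas the _mm*_cvt* intrinsics perform an
+ * actual per-element value conversion.
+ */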
+int test_mm256_castpd128_pd256()
+{
+    __m128d a = vld1q_f64(g_test_mm256_castpd128_pd256_data.a);
+    __m256d dst = _mm256_castpd128_pd256(a);
+
+    return IsEqualFloat64x2(dst.vect_f64[0], g_test_mm256_castpd128_pd256_data.expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm256_castpd256_pd128()
+{
+    float64_t *src = g_test_mm256_castpd256_pd128_data.a;
+    __m256d a = _mm256_set_pd(src[3], src[2], src[1], src[0]);
+    __m128d dst = _mm256_castpd256_pd128(a);
+
+    return comp_return(g_test_mm256_castpd256_pd128_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_castps128_ps256()
+{
+    __m128 a = vld1q_f32(g_test_mm256_castps128_ps256_data.a);
+    __m256 dst = _mm256_castps128_ps256(a);
+
+    return IsEqualFloat32x4(dst.vect_f32[0], g_test_mm256_castps128_ps256_data.expect, DEFAULT_EPSILON_F32);
+}
+
+int test_mm256_castps256_ps128()
+{
+    float32_t *src = g_test_mm256_castps256_ps128_data.a;
+    __m256 a = _mm256_set_ps(src[7], src[6], src[5], src[4], src[3], src[2], src[1], src[0]);
+
+    __m128 dst = _mm256_castps256_ps128(a);
+
+    return comp_return(g_test_mm256_castps256_ps128_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_castsi128_si256()
+{
+    int32_t *src = g_test_mm256_castsi128_si256_data.a;
+    __m128i a = _mm_set_epi32(src[3], src[2], src[1], src[0]);
+    __m256i dst = _mm256_castsi128_si256(a);
+
+    return comp_return(g_test_mm256_castsi128_si256_data.expect, &dst, sizeof(__m128i));
+}
+
+int test_mm256_castsi256_ps()
+{
+    int32_t *src = g_test_mm256_castsi256_ps_data.a;
+
+    __m256i a = _mm256_set_epi32(src[7], src[6], src[5], src[4], src[3], src[2], src[1], src[0]);
+    __m256 dst = _mm256_castsi256_ps(a);
+
+    return comp_return(g_test_mm256_castsi256_ps_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_castsi256_si128()
+{
+    int32_t *src = g_test_mm256_castsi256_si128_data.a;
+    __m256i a = _mm256_set_epi32(src[7], src[6], src[5], src[4], src[3], src[2], src[1], src[0]);
+    __m128i dst = _mm256_castsi256_si128(a);
+
+    return comp_return(g_test_mm256_castsi256_si128_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_cvtepi32_pd()
+{
+    int32_t *src = g_test_mm256_cvtepi32_pd_data.a;
+
+    __m128i a = _mm_set_epi32(src[3], src[2], src[1], src[0]);
+    __m256d dst = _mm256_cvtepi32_pd(a);
+
+    return comp_return(g_test_mm256_cvtepi32_pd_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_cvtepi32_ps()
+{
+    int32_t *src = g_test_mm256_cvtepi32_ps_data.a;
+
+    __m256i a = _mm256_set_epi32(src[7], src[6], src[5], src[4], src[3], src[2], src[1], src[0]);
+    __m256 dst = _mm256_cvtepi32_ps(a);
+
+    return comp_return(g_test_mm256_cvtepi32_ps_data.expect, &dst, sizeof(dst));
+}
+
+int test_mm256_store_si256()
+{
+    int32_t *a = g_test_mm256_store_si256_data.a;
+    int32_t *expect = g_test_mm256_store_si256_data.expect;
+    int iCount;
+    __m256i ma, res;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+    }
+    _mm256_store_si256(&res, ma);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_storeu_si256()
+{
+    int32_t *a = g_test_mm256_storeu_si256_data.a;
+    int32_t *expect = g_test_mm256_storeu_si256_data.expect;
+    int iCount;
+    __m256i ma, res;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+    }
+    _mm256_storeu_si256(&res, ma);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm512_store_si512()
+{
+    int32_t *a = g_test_mm512_store_si512_data.a;
+    int32_t *expect = g_test_mm512_store_si512_data.expect;
+    int iCount;
+    __m512i ma, res;
+    for (iCount = 0; iCount < g_512bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+    }
+    _mm512_store_si512(&res, ma);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm256_inserti128_si256()
+{
+    int32_t *a = g_test_mm256_inserti128_si256_data.a;
+    int32_t *b = g_test_mm256_inserti128_si256_data.b;
+    int32_t *expect = g_test_mm256_inserti128_si256_data.expect;
+    int iCount;
+    __m256i ma, res;
+    __m128i mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s32[iCount] = vld1q_s32(a + iCount * 4);
+    }
+    mb.vect_s32 = vld1q_s32(b);
+    res = _mm256_inserti128_si256(ma, mb, 0);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_insertf128_pd()
+{
+    float64_t *a = g_test_mm256_insertf128_pd_data.a;
+    float64_t *b = g_test_mm256_insertf128_pd_data.b;
+    int imm = g_test_mm256_insertf128_pd_data.imm;
+    float64_t *expect = g_test_mm256_insertf128_pd_data.expect;
+    int iCount;
+    __m256d ma, res;
+    __m128d mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_f64[iCount] = vld1q_f64(a + iCount * 2);
+    }
+    mb = vld1q_f64(b);
+    res = _mm256_insertf128_pd(ma, mb, imm);
+
+    return IsEqualFloat64x4(res, expect, DEFAULT_EPSILON_F64);
+}
+int test_mm256_insertf128_ps()
+{
+    float32_t *a = g_test_mm256_insertf128_ps_data.a;
+    float32_t *b = g_test_mm256_insertf128_ps_data.b;
+    int imm = g_test_mm256_insertf128_ps_data.imm;
+    float32_t *expect = g_test_mm256_insertf128_ps_data.expect;
+    int iCount;
+    __m256 ma, res;
+    __m128 mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_f32[iCount] = vld1q_f32(a + iCount * 4);
+    }
+    mb = vld1q_f32(b);
+    res = _mm256_insertf128_ps(ma, mb, imm);
+
+    return IsEqualFloat32x8(res, expect, DEFAULT_EPSILON_F32);
+}
+int test_mm256_permute4x64_epi64()
+{
+    int64_t *a = g_test_mm256_permute4x64_epi64_data.a;
+    int imm = g_test_mm256_permute4x64_epi64_data.imm;
+    int64_t *expect = g_test_mm256_permute4x64_epi64_data.expect;
+    int iCount;
+    __m256i ma, res;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s64[iCount] = vld1q_s64(a + iCount * 2);
+    }
+    res = _mm256_permute4x64_epi64(ma, imm);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_alignr_epi8()
+{
+    int8_t *a = g_test_mm256_alignr_epi8_data.a;
+    int8_t *b = g_test_mm256_alignr_epi8_data.b;
+    const int count = g_test_mm256_alignr_epi8_data.count;
+    int8_t *expect = g_test_mm256_alignr_epi8_data.expect;
+    int iCount;
+    __m256i ma, mb;
+    for (iCount = 0; iCount < g_256bit_divto_128bit; iCount++) {
+        ma.vect_s8[iCount] = vld1q_s8(a + iCount * 16);
+        mb.vect_s8[iCount] = vld1q_s8(b + iCount * 16);
+    }
+    __m256i res = _mm256_alignr_epi8(ma, mb, count);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
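+/*
+ * SSE4.2 explicit-length string compares: la/lb give the operand lengths
+ * and imm8 selects element format, aggregation and polarity; cmpestri
+ * returns an index, cmpestrm a bit/byte mask in an __m128i.
+ */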
+int test_mm_cmpestri()
+{
+    int16_t *a = g_test_mm_cmpestri_data_model_data.a;
+    int16_t *b = g_test_mm_cmpestri_data_model_data.b;
+    int la = g_test_mm_cmpestri_data_model_data.la;
+    int lb = g_test_mm_cmpestri_data_model_data.lb;
+    const int imm8 = g_test_mm_cmpestri_data_model_data.imm8;
+    int expect = g_test_mm_cmpestri_data_model_data.expect;
+    __m128i ma, mb;
+    ma.vect_s16 = vld1q_s16(a);
+    mb.vect_s16 = vld1q_s16(b);
+    int res = _mm_cmpestri(ma, la, mb, lb, imm8);
+    return res == expect;
+}
+
+int test_mm_cmpestrm()
+{
+    int16_t *a = g_test_mm_cmpestrm_data_model_data.a;
+    int16_t *b = g_test_mm_cmpestrm_data_model_data.b;
+    int la = g_test_mm_cmpestrm_data_model_data.la;
+    int lb = g_test_mm_cmpestrm_data_model_data.lb;
+    const int imm8 = g_test_mm_cmpestrm_data_model_data.imm8;
+    int16_t *expect = g_test_mm_cmpestrm_data_model_data.expect;
+    __m128i ma, mb, res;
+    ma.vect_s16 = vld1q_s16(a);
+    mb.vect_s16 = vld1q_s16(b);
+    res = _mm_cmpestrm(ma, la, mb, lb, imm8);
+    return comp_return(expect, &res, sizeof(__m128i));
+}
+
+int test_mm_insert_epi32()
+{
+    __m128i a;
+    int i = g_test_mm_insert_epi32_data.i;
+    int32_t *expect = g_test_mm_insert_epi32_data.expect;
+
+    a.vect_s32 = vld1q_s32(g_test_mm_insert_epi32_data.a);
+    __m128i res = _mm_insert_epi32(a, i, 3);
+    return comp_return(expect, &res, sizeof(__m128i));
+}
+
+int test_mm256_insert_epi32()
+{
+    __m256i a;
+    int i = g_test_mm256_insert_epi32_data.i;
+    int32_t *expect = g_test_mm256_insert_epi32_data.expect;
+
+    for (unsigned int j = 0; j < M256_M128_NUM; j++) {
+        a.vect_s32[j] = vld1q_s32(g_test_mm256_insert_epi32_data.a + j * M128I_INT32_NUM);
+    }
+    __m256i res = _mm256_insert_epi32(a, i, 6);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
+int test_mm256_insert_epi64()
+{
+    __m256i a;
+    int64_t i = g_test_mm256_insert_epi64_data.i;
+    int64_t *expect = g_test_mm256_insert_epi64_data.expect;
+
+    for (unsigned int j = 0; j < M256_M128_NUM; j++) {
+        a.vect_s64[j] = vld1q_s64(g_test_mm256_insert_epi64_data.a + j * M128I_INT64_NUM);
+    }
+    __m256i res = _mm256_insert_epi64(a, i, 3);
+    return comp_return(expect, &res, sizeof(__m256i));
+}
+
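+/*
+ * The remaining 512-bit tests index sub-vectors with the M512_M128_NUM /
+ * M256_M128_NUM and M128*_NUM constants (number of 128-bit parts and of
+ * elements per 128-bit part) instead of the g_*_divto_128bit globals.
+ */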
+int test_mm512_castpd128_pd512()
+{
+    __m128d a = vld1q_f64(g_test_mm512_castpd128_pd512_data.a);
+    __m512d res = _mm512_castpd128_pd512(a);
+
+    return IsEqualFloat64x2(res.vect_f64[0], g_test_mm512_castpd128_pd512_data.expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm512_castpd512_pd128()
+{
+    __m512d a;
+    double *expect = g_test_mm512_castpd512_pd128_data.expect;
+    __m128d res;
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_f64[i] = vld1q_f64(g_test_mm512_castpd512_pd128_data.a + i * M128D_FLOAT64_NUM);
+    }
+    res = _mm512_castpd512_pd128(a);
+
+    return IsEqualFloat64x2(res, expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm512_castps128_ps512()
+{
+    __m128 a = vld1q_f32(g_test_mm512_castps128_ps512_data.a);
+    __m512 res = _mm512_castps128_ps512(a);
+
+    return IsEqualFloat32x4(res.vect_f32[0], g_test_mm512_castps128_ps512_data.expect, DEFAULT_EPSILON_F32);
+}
+
+int test_mm512_castps512_ps128()
+{
+    __m512 a;
+    float *expect = g_test_mm512_castps512_ps128_data.expect;
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_f32[i] = vld1q_f32(g_test_mm512_castps512_ps128_data.a + i * M128_FLOAT32_NUM);
+    }
+    __m128 res = _mm512_castps512_ps128(a);
+
+    return IsEqualFloat32x4(res, expect, DEFAULT_EPSILON_F32);
+}
+
+int test_mm512_cvtepi32_pd()
+{
+    __m256i a;
+    double *expect = g_test_mm512_cvtepi32_pd_data.expect;
+
+    for (unsigned int i = 0; i < M256_M128_NUM; i++) {
+        a.vect_s32[i] = vld1q_s32(g_test_mm512_cvtepi32_pd_data.a + i * M128I_INT32_NUM);
+    }
+    __m512d res = _mm512_cvtepi32_pd(a);
+    return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm512_cvtepi32_ps()
+{
+    __m512i a;
+    float *expect = g_test_mm512_cvtepi32_ps_data.expect;
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_s32[i] = vld1q_s32(g_test_mm512_cvtepi32_ps_data.a + i * M128I_INT32_NUM);
+    }
+    __m512 res = _mm512_cvtepi32_ps(a);
+
+    return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32);
+}
+
+int test_mm512_insertf32x8()
+{
+    __m512 a;
+    __m256 b;
+    int imm8 = g_test_mm512_insertf32x8_data.imm8;
+    float *expect = g_test_mm512_insertf32x8_data.expect;
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_f32[i] = vld1q_f32(g_test_mm512_insertf32x8_data.a + i * M128_FLOAT32_NUM);
+    }
+    for (unsigned int i = 0; i < M256_M128_NUM; i++) {
+        b.vect_f32[i] = vld1q_f32(g_test_mm512_insertf32x8_data.b + i * M128_FLOAT32_NUM);
+    }
+    __m512 res = _mm512_insertf32x8(a, b, imm8);
+    return IsEqualFloat32x16(res, expect, DEFAULT_EPSILON_F32);
+}
+
+int test_mm512_insertf64x4()
+{
+    __m512d a;
+    __m256d b;
+    int imm8 = g_test_mm512_insertf64x4_data.imm8;
+    double *expect = g_test_mm512_insertf64x4_data.expect;
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_f64[i] = vld1q_f64(g_test_mm512_insertf64x4_data.a + i * M128D_FLOAT64_NUM);
+    }
+    for (unsigned int i = 0; i < M256_M128_NUM; i++) {
+        b.vect_f64[i] = vld1q_f64(g_test_mm512_insertf64x4_data.b + i * M128D_FLOAT64_NUM);
+    }
+    __m512d res = _mm512_insertf64x4(a, b, imm8);
+    return IsEqualFloat64x8(res, expect, DEFAULT_EPSILON_F64);
+}
+
+int test_mm512_inserti32x8()
+{
+    __m512i a;
+    __m256i b;
+    int imm8 = g_test_mm512_inserti32x8_data.imm8;
+    int32_t *expect = g_test_mm512_inserti32x8_data.expect;
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_s32[i] = vld1q_s32(g_test_mm512_inserti32x8_data.a + i * M128I_INT32_NUM);
+    }
+    for (unsigned int i = 0; i < M256_M128_NUM; i++) {
+        b.vect_s32[i] = vld1q_s32(g_test_mm512_inserti32x8_data.b + i * M128I_INT32_NUM);
+    }
+    __m512i res = _mm512_inserti32x8(a, b, imm8);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
+int test_mm512_inserti64x4()
+{
+    __m512i a;
+    __m256i b;
+    int imm8 = g_test_mm512_inserti64x4_data.imm8;
+    int64_t *expect = g_test_mm512_inserti64x4_data.expect;
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_s64[i] = vld1q_s64(g_test_mm512_inserti64x4_data.a + i * M128I_INT64_NUM);
+    }
+    for (unsigned int i = 0; i < M256_M128_NUM; i++) {
+        b.vect_s64[i] = vld1q_s64(g_test_mm512_inserti64x4_data.b + i * M128I_INT64_NUM);
+    }
+    __m512i res = _mm512_inserti64x4(a, b, imm8);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
+
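+/*
+ * _mm256_cmp_pd/_mm256_cmp_ps support all 32 _CMP_* predicates, so each
+ * predicate is exercised against both an unordered (NaN-containing) and
+ * an ordered data set; the MM256_CMP_PD/PS helper macros are assumed to
+ * run the comparison and drive the per-lane loop variable i.
+ */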
+int test_mm256_cmp_pd()
+{
+    __m256d source1, source2, dest;
+    int i;
+    long long expect[4];
+
+    __m256d s1 = test_mm256_cmp_pd_data_model_unordered_data1;
+    __m256d s2 = test_mm256_cmp_pd_data_model_unordered_data2;
+    for (int j = 0; j < 32; j++) {
+        MM256_CMP_PD(j, test_mm256_cmp_pd_data_model_unordered_ret[j][i], expect);
+    }
+
+    s1 = test_mm256_cmp_pd_data_model_ordered_data1;
+    s2 = test_mm256_cmp_pd_data_model_ordered_data2;
+
+    for (int j = 0; j < 32; j++) {
+        MM256_CMP_PD(j, test_mm256_cmp_pd_data_model_ordered_ret[j][i], expect);
+    }
+    return TRUE;
+}
+
+int test_mm256_cmp_ps()
+{
+    __m256 source1, source2, dest;
+    int i;
+    int expect[8];
+
+    __m256 s1 = test_mm256_cmp_ps_data_model_unordered_data1;
+    __m256 s2 = test_mm256_cmp_ps_data_model_unordered_data2;
+    for (int j = 0; j < 32; j++) {
+        MM256_CMP_PS(j, test_mm256_cmp_ps_data_model_unordered_ret[j][i], expect);
+    }
+
+    s1 = test_mm256_cmp_ps_data_model_ordered_data1;
+    s2 = test_mm256_cmp_ps_data_model_ordered_data2;
+
+    for (int j = 0; j < 32; j++) {
+        MM256_CMP_PS(j, test_mm256_cmp_ps_data_model_ordered_ret[j][i], expect);
+    }
+    return TRUE;
+}
+
+int test_mm512_cmp_pd_mask()
+{
+    __m512d a, b;
+    __mmask8 result[32];
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_f64[i] = vld1q_f64(g_test_mm512_cmp_pd_mask_data1.a + i * M128D_FLOAT64_NUM);
+        b.vect_f64[i] = vld1q_f64(g_test_mm512_cmp_pd_mask_data1.b + i * M128D_FLOAT64_NUM);
+    }
+    __mmask8* expect = g_test_mm512_cmp_pd_mask_data1.expect;
+    for (int i = 0; i < 32; i++) {
+        result[i] = _mm512_cmp_pd_mask(a, b, i);
+    }
+    if (!comp_return(result, expect, sizeof(__mmask8) * 32)) {
+        return FALSE;
+    }
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_f64[i] = vld1q_f64(g_test_mm512_cmp_pd_mask_data2.a + i * M128D_FLOAT64_NUM);
+        b.vect_f64[i] = vld1q_f64(g_test_mm512_cmp_pd_mask_data2.b + i * M128D_FLOAT64_NUM);
+    }
+    expect = g_test_mm512_cmp_pd_mask_data2.expect;
+    for (int i = 0; i < 32; i++) {
+        result[i] = _mm512_cmp_pd_mask(a, b, i);
+    }
+    if (!comp_return(result, expect, sizeof(__mmask8) * 32)) {
+        return FALSE;
+    }
+    return TRUE;
+}
+
+int test_mm512_cmp_ps_mask()
+{
+    __m512 a, b;
+    __mmask16 result[32];
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_f32[i] = vld1q_f32(g_test_mm512_cmp_ps_mask_data1.a + i * M128_FLOAT32_NUM);
+        b.vect_f32[i] = vld1q_f32(g_test_mm512_cmp_ps_mask_data1.b + i * M128_FLOAT32_NUM);
+    }
+    __mmask16* expect = g_test_mm512_cmp_ps_mask_data1.expect;
+    for (int i = 0; i < 32; i++) {
+        result[i] = _mm512_cmp_ps_mask(a, b, i);
+    }
+    if (!comp_return(result, expect, sizeof(__mmask16) * 32)) {
+        return FALSE;
+    }
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_f32[i] = vld1q_f32(g_test_mm512_cmp_ps_mask_data2.a + i * M128_FLOAT32_NUM);
+        b.vect_f32[i] = vld1q_f32(g_test_mm512_cmp_ps_mask_data2.b + i * M128_FLOAT32_NUM);
+    }
+    expect = g_test_mm512_cmp_ps_mask_data2.expect;
+    for (int i = 0; i < 32; i++) {
+        result[i] = _mm512_cmp_ps_mask(a, b, i);
+    }
+    if (!comp_return(result, expect, sizeof(__mmask16) * 32)) {
+        return FALSE;
+    }
+    return TRUE;
+}
+int test_mm512_permutexvar_epi32()
+{
+    __m512i idx;
+    __m512i a;
+    int32_t *expect = g_test_mm512_permutexvar_epi32_data.expect;
+
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        idx.vect_s32[i] = vld1q_s32(g_test_mm512_permutexvar_epi32_data.idx + i * M128I_INT32_NUM);
+    }
+    for (unsigned int i = 0; i < M512_M128_NUM; i++) {
+        a.vect_s32[i] = vld1q_s32(g_test_mm512_permutexvar_epi32_data.a + i * M128I_INT32_NUM);
+    }
+    __m512i res = _mm512_permutexvar_epi32(idx, a);
+    return comp_return(expect, &res, sizeof(__m512i));
+}
diff --git a/tests/a2ntest.h b/tests/a2ntest.h
new file mode 100644
index 0000000..af90aa0
--- /dev/null
+++ b/tests/a2ntest.h
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
+ * Description: avx2neon test header file
+ * Author: xuqimeng
+ * Create: 2019-11-05
+*/
+#ifndef AVX2NEON_TEST_H
+#define AVX2NEON_TEST_H
+
+#define TRUE 1
+#define FALSE 0
+
+#define DEFAULT_EPSILON_F32 1e-4
+#define DEFAULT_EPSILON_F64 1e-8
+
+#include "avx2neon.h"
+
+typedef enum {
+    UT_MM_EXTRACT_EPI32,
+    UT_MM_EXTRACT_EPI64,
+    UT_MM_MOVE_SD,
+    UT_MM_MOVE_SS,
+    UT_MM256_ADD_EPI16,
+    UT_MM256_ADD_EPI32,
+    UT_MM256_ADD_EPI64,
+    UT_MM256_ADD_EPI8,
+    UT_MM256_ADD_PD,
+    UT_MM256_ADD_PS,
+    UT_MM256_ADDS_EPI16,
+    UT_MM256_ADDS_EPI8,
+    UT_MM256_ADDS_EPU16,
+    UT_MM256_ADDS_EPU8,
+    UT_MM256_ADDSUB_PD,
+    UT_MM256_ADDSUB_PS,
+    UT_MM256_BLENDV_PD,
+    UT_MM256_BLENDV_PS,
+    UT_MM256_BLEND_PD,
+    UT_MM256_BLEND_PS,
+    UT_MM512_MASK_BLEND_PD,
+    UT_MM512_MASK_BLEND_PS,
+    UT_MM256_CASTPD128_PD256,
+    UT_MM256_CASTPD256_PD128,
+    UT_MM256_CASTPS128_PS256,
+    UT_MM256_CASTPS256_PS128,
+    UT_MM256_CVTEPI32_PD,
+    UT_MM256_CVTEPI32_PS,
+    UT_MM256_DIV_EPI16,
+    UT_MM256_DIV_EPI32,
+    UT_MM256_DIV_EPI64,
+    UT_MM256_DIV_EPI8,
+    UT_MM256_DIV_EPU16,
+    UT_MM256_DIV_EPU32,
+    UT_MM256_DIV_EPU64,
+    UT_MM256_DIV_EPU8,
+    UT_MM256_DIV_PD,
+    UT_MM256_DIV_PS,
+    UT_MM256_EXTRACT_EPI32,
+    UT_MM256_EXTRACT_EPI64,
+    UT_MM256_EXTRACTF128_PD,
+    UT_MM256_EXTRACTF128_PS,
+    UT_MM256_INSERTF128_PD,
+    UT_MM256_INSERTF128_PS,
+    UT_MM256_MUL_EPI32,
+    UT_MM256_MUL_EPU32,
+    UT_MM256_MUL_PD,
+    UT_MM256_MUL_PS,
+    UT_MM256_MULHI_EPI16,
+    UT_MM256_MULHI_EPU16,
+    UT_MM256_MULHRS_EPI16,
+    UT_MM256_MULLO_EPI16,
+    UT_MM256_MULLO_EPI32,
+    UT_MM256_MULLO_EPI64,
+    UT_MM256_MULTISHIFT_EPI64_EPI8,
+    UT_MM256_SET1_EPI32,
+    UT_MM256_SET1_PD,
+    UT_MM256_SET1_PS,
+    UT_MM256_SUB_EPI16,
+    UT_MM256_SUB_EPI32,
+    UT_MM256_SUB_EPI64,
+    UT_MM256_SUB_EPI8,
+    UT_MM256_SUB_PD,
+    UT_MM256_SUB_PS,
+    UT_MM256_SUBS_EPI16,
+    UT_MM256_SUBS_EPI8,
+    UT_MM256_SUBS_EPU16,
+    UT_MM256_SUBS_EPU8,
+    UT_MM512_ADD_EPI16,
+    UT_MM512_ADD_EPI32,
+    UT_MM512_ADD_EPI64,
+    UT_MM512_ADD_EPI8,
+    UT_MM512_ADD_PD,
+    UT_MM512_ADD_PS,
+    UT_MM512_ADD_ROUND_PD,
+    UT_MM512_ADD_ROUND_PS,
+    UT_MM512_ADDN_PD,
+    UT_MM512_ADDN_PS,
+    UT_MM512_ADDN_ROUND_PD,
+    UT_MM512_ADDN_ROUND_PS,
+    UT_MM512_ADDS_EPI16,
+    UT_MM512_ADDS_EPI8,
+    UT_MM512_ADDS_EPU16,
+    UT_MM512_ADDS_EPU8,
+    UT_MM512_ADDSETC_EPI32,
+    UT_MM512_ADDSETS_EPI32,
+    UT_MM512_ADDSETS_PS,
+    UT_MM512_ADDSETS_ROUND_PS,
+    UT_MM512_DIV_EPI16,
+    UT_MM512_DIV_EPI32,
+    UT_MM512_DIV_EPI64,
+    UT_MM512_DIV_EPI8,
+    UT_MM512_DIV_EPU16,
+    UT_MM512_DIV_EPU32,
+    UT_MM512_DIV_EPU64,
+    UT_MM512_DIV_EPU8,
+    UT_MM512_DIV_PD,
+    UT_MM512_DIV_PS,
+    UT_MM512_DIV_ROUND_PD,
+    UT_MM512_DIV_ROUND_PS,
+    UT_MM512_EXTRACTF32x8_PS,
+    UT_MM512_EXTRACTF64x4_PD,
+    UT_MM512_MUL_EPI32,
+    UT_MM512_MUL_EPU32,
+    UT_MM512_MUL_PD,
+    UT_MM512_MUL_PS,
+    UT_MM512_MUL_ROUND_PD,
+    UT_MM512_MUL_ROUND_PS,
+    UT_MM512_MULHI_EPI16,
+    UT_MM512_MULHI_EPI32,
+    UT_MM512_MULHI_EPU16,
+    UT_MM512_MULHI_EPU32,
+    UT_MM512_MULHRS_EPI16,
+    UT_MM512_MULLO_EPI16,
+    UT_MM512_MULLO_EPI32,
+    UT_MM512_MULLO_EPI64,
+    UT_MM512_MULLOX_EPI64,
+    UT_MM512_MULTISHIFT_EPI64_EPI8,
+    UT_MM512_SUB_EPI16,
+    UT_MM512_SUB_EPI32,
+    UT_MM512_SUB_EPI64,
+    UT_MM512_SUB_EPI8,
+    UT_MM512_SUB_PD,
+    UT_MM512_SUB_PS,
+    UT_MM512_SUB_ROUND_PD,
+    UT_MM512_SUB_ROUND_PS,
+    UT_MM512_SUBR_EPI32,
+    UT_MM512_SUBR_PD,
+    UT_MM512_SUBR_PS,
+    UT_MM512_SUBR_ROUND_PD,
+    UT_MM512_SUBR_ROUND_PS,
+    UT_MM512_SUBRSETB_EPI32,
+    UT_MM512_SUBS_EPI16,
+    UT_MM512_SUBS_EPI8,
+    UT_MM512_SUBS_EPU16,
+    UT_MM512_SUBS_EPU8,
+    UT_MM512_SUBSETB_EPI32,
+    UT_MM256_ALIGNR_EPI8,
+    UT_MM256_AND_SI256,
+    UT_MM256_ANDNOT_SI256,
+    UT_MM256_BROADCASTQ_EPI64,
+    UT_MM256_BROADCASTSI128_SI256,
+    UT_MM256_CASTSI128_SI256,
+    UT_MM256_CASTSI256_PS,
+    UT_MM256_CASTSI256_SI128,
+    UT_MM256_CMPEQ_EPI32,
+    UT_MM256_CMPEQ_EPI8,
+    UT_MM256_EXTRACTI128_SI256,
+    UT_MM256_INSERTI128_SI256,
+    UT_MM256_LOAD_SI256,
+    UT_MM256_LOADU_SI256,
+    UT_MM256_MASKLOAD_EPI32,
+    UT_MM256_MOVEMASK_EPI8,
+    UT_MM256_MOVEMASK_PS,
+    UT_MM256_OR_SI256,
+    UT_MM256_OR_PS,
+    UT_MM256_OR_PD,
+    UT_MM256_PERMUTE4X64_EPI64,
+    UT_MM256_SET_EPI64X,
+    UT_MM256_SET_M128I,
+    UT_MM256_SET1_EPI64X,
+    UT_MM256_SET1_EPI8,
+    UT_MM256_SETZERO_SI256,
+    UT_MM256_SET_PS,
+    UT_MM256_SET_PD,
+    UT_MM256_SETZERO_PS,
+    UT_MM256_SETZERO_PD,
+    UT_MM256_SHUFFLE_EPI8,
+    UT_MM_SLL_EPI64,
+    UT_MM256_SLL_EPI32,
+    UT_MM256_SLL_EPI64,
+    UT_MM256_SLLI_EPI64,
+    UT_MM256_SLLI_SI256,
+    UT_MM256_SLLI_EPI32,
+    UT_MM256_SRLI_EPI64,
+    UT_MM256_SRLI_SI256,
+    UT_MM256_STORE_SI256,
+    UT_MM256_STOREU_SI256,
+    UT_MM256_TESTZ_SI256,
+    UT_MM256_UNPACKHI_EPI8,
+    UT_MM256_UNPACKLO_EPI8,
+    UT_MM256_XOR_SI256,
+    UT_MM256_ZEROUPPER,
+    UT_MM512_ABS_EPI8,
+    UT_MM512_AND_SI512,
+    UT_MM512_ANDNOT_SI512,
+    UT_MM512_BROADCAST_I32X4,
+    UT_MM512_BROADCAST_I64X4,
+    UT_MM512_BSLLI_EPI128,
+    UT_MM512_BSRLI_EPI128,
+    UT_MM512_CMP_EPI32_MASK,
+    UT_MM512_CMP_EPI8_MASK,
+    UT_MM512_CMPEQ_EPI8_MASK,
+    UT_MM512_EXTRACTI32X4_EPI32,
+    UT_MM512_LOAD_SI512,
+    UT_MM512_LOADU_SI512,
+    UT_MM512_MASK_BROADCAST_I64X4,
+    UT_MM512_MASK_CMPEQ_EPI8_MASK,
+    UT_MM512_MASK_LOADU_EPI8,
+    UT_MM512_MASKZ_LOADU_EPI8,
+    UT_MM512_MASKZ_SHUFFLE_EPI8,
+    UT_MM512_MOVM_EPI8,
+    UT_MM512_OR_SI512,
+    UT_MM512_PERMUTEXVAR_EPI64,
+    UT_MM512_SET_EPI32,
+    UT_MM512_SET_EPI64,
+    UT_MM512_SET1_EPI32,
+    UT_MM512_SET1_EPI64,
+    UT_MM512_SET1_EPI8,
+    UT_MM512_SET_PS,
+    UT_MM512_SET_PD,
+    UT_MM512_SET1_PS,
+    UT_MM512_SET1_PD,
+    UT_MM512_SETZERO_PS,
+    UT_MM512_SETZERO_PD,
+    UT_MM512_SHUFFLE_EPI8,
+    UT_MM512_SLL_EPI64,
+    UT_MM512_SLLI_EPI64,
+    UT_MM512_SRLI_EPI64,
+    UT_MM512_STORE_SI512,
+    UT_MM512_TEST_EPI8_MASK,
+    UT_MM512_TEST_EPI32_MASK,
+    UT_MM512_TEST_EPI64_MASK,
+    UT_MM512_UNPACKHI_EPI8,
+    UT_MM512_UNPACKLO_EPI8,
+    UT_MM512_XOR_SI512,
+    UT_MM512_AND_EPI32,
+    UT_MM512_AND_EPI64,
+    UT_MM512_OR_EPI32,
+    UT_MM512_OR_EPI64,
+    UT_MM512_XOR_PS,
+    UT_MM512_XOR_PD,
+    UT_MM_CMPEQ_EPI64,
+    UT_MM_CMPESTRI,
+    UT_MM_CMPESTRM,
+    UT_MM_CRC32_U16,
+    UT_MM_CRC32_U32,
+    UT_MM_CRC32_U64,
+    UT_MM_CRC32_U8,
+    UT_MM_EXTRACT_PS,
+    UT_MM_POPCNT_U32,
+    UT_MM_POPCNT_U64,
+    UT_MM_SET_PD,
+    UT_MM_SET1_EPI64X,
+    UT_MM_SET1_PD,
+    UT_MM_TESTZ_SI128,
+    UT_MM256_CMP_PD,
+    UT_MM256_CMP_PS,
+    UT_MM512_CMP_PD_MASK,
+    UT_MM512_CMP_PS_MASK,
+    UT_MM_INSERT_EPI32,
+    UT_MM256_INSERT_EPI32,
+    UT_MM256_INSERT_EPI64,
+    UT_MM512_CASTPD128_PD512,
+    UT_MM512_CASTPD512_PD128,
+    UT_MM512_CASTPS128_PS512,
+    UT_MM512_CASTPS512_PS128,
+    UT_MM512_CVTEPI32_PD,
+    UT_MM512_CVTEPI32_PS,
+    UT_MM512_INSERTF32X8,
+    UT_MM512_INSERTF64X4,
+    UT_MM512_INSERTI32X8,
+    UT_MM512_INSERTI64X4,
+    UT_MM512_PERMUTEXVAR_EPI32,
+    UT_MM256_SET_EPI32
+} InstructionTest;
+
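+/*
+ * Note: main.c iterates from UT_MM_EXTRACT_EPI32 through UT_MM256_SET_EPI32,
+ * so new enumerators must stay inside that range and be matched by a
+ * corresponding case in RunTest() for the test to be picked up.
+ */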
+const char *RunTest(InstructionTest test, int *flag);
+
+int IsEqualFloat32x4(__m128 a, const float32_t *x, float epsilon);
+int IsEqualFloat64x2(__m128d a, const float64_t *x, float epsilon);
+
+int test_mm_popcnt_u32();
+int test_mm_popcnt_u64();
+int test_mm256_div_epi8();
+int test_mm256_div_epi16();
+int test_mm256_div_epi32();
+int test_mm256_div_epi64();
+int test_mm256_div_epu8();
+int test_mm256_div_epu16();
+int test_mm256_div_epu32();
+int test_mm256_div_epu64();
+int test_mm256_div_ps();
+int test_mm256_div_pd();
+int test_mm512_div_ps();
+int test_mm512_div_pd();
+int test_mm256_add_epi8();
+int test_mm256_add_epi16();
+int test_mm256_add_epi32();
+int test_mm256_add_epi64();
+int test_mm512_add_epi8();
+int test_mm512_add_epi16();
+int test_mm512_add_epi32();
+int test_mm512_add_epi64();
+int test_mm256_adds_epi8();
+int test_mm256_adds_epi16();
+int test_mm256_adds_epu8();
+int test_mm256_adds_epu16();
+int test_mm512_adds_epi8();
+int test_mm512_adds_epi16();
+int test_mm512_adds_epu8();
+int test_mm512_adds_epu16();
+int test_mm256_add_ps();
+int test_mm256_add_pd();
+int test_mm512_add_ps();
+int test_mm512_add_pd();
+int test_mm512_add_round_ps();
+int test_mm512_add_round_pd();
+int test_mm512_addn_ps();
+int test_mm512_addn_pd();
+int test_mm512_addn_round_ps();
+int test_mm512_addn_round_pd();
+int test_mm512_addsetc_epi32();
+int test_mm512_addsets_epi32();
+int test_mm512_addsets_ps();
+int test_mm512_addsets_round_ps();
+int test_mm256_addsub_ps();
+int test_mm256_addsub_pd();
+int test_mm256_blendv_ps();
+int test_mm256_blendv_pd();
+int test_mm256_blend_ps();
+int test_mm256_blend_pd();
+int test_mm512_mask_blend_ps();
+int test_mm512_mask_blend_pd();
+int test_mm256_sub_epi16();
+int test_mm256_sub_epi32();
+int test_mm256_sub_epi64();
+int test_mm256_sub_epi8();
+int test_mm256_sub_pd();
+int test_mm256_sub_ps();
+int test_mm512_sub_epi16();
+int test_mm512_sub_epi32();
+int test_mm512_sub_epi64();
+int test_mm512_sub_epi8();
+int test_mm512_sub_pd();
+int test_mm512_sub_ps();
+int test_mm256_subs_epi16();
+int test_mm256_subs_epi8();
+int test_mm256_subs_epu16();
+int test_mm256_subs_epu8();
+int test_mm512_subs_epi16();
+int test_mm512_subs_epi8();
+int test_mm512_subs_epu16();
+int test_mm512_subs_epu8();
+int test_mm512_sub_round_pd();
+int test_mm512_sub_round_ps();
+int test_mm512_subr_epi32();
+int test_mm512_subr_ps();
+int test_mm512_subr_pd();
+int test_mm512_subr_round_ps();
+int test_mm512_subr_round_pd();
+int test_mm512_subsetb_epi32();
+int test_mm512_subrsetb_epi32();
+int test_mm256_zeroupper();
+int test_mm512_bslli_epi128();
+int test_mm512_bsrli_epi128();
+int test_mm512_permutexvar_epi64();
+int test_mm512_extracti32x4_epi32();
+int test_mm512_test_epi8_mask();
+int test_mm512_test_epi32_mask();
+int test_mm512_test_epi64_mask();
+int test_mm256_mul_epi32();
+int test_mm256_mul_epu32();
+int test_mm256_mul_pd();
+int test_mm256_mul_ps();
+int test_mm256_mulhi_epi16();
+int test_mm256_mulhi_epu16();
+int test_mm512_mul_epi32();
+int test_mm512_mul_epu32();
+int test_mm512_mul_pd();
+int test_mm512_mul_ps();
+int test_mm512_mulhi_epi16();
+int test_mm512_mulhi_epu16();
+int test_mm512_mulhi_epi32();
+int test_mm512_mulhi_epu32();
+int test_mm256_mullo_epi16();
+int test_mm256_mullo_epi32();
+int test_mm256_mullo_epi64();
+int test_mm512_mullo_epi16();
+int test_mm512_mullo_epi32();
+int test_mm512_mullo_epi64();
+int test_mm512_mullox_epi64();
+int test_mm256_mulhrs_epi16();
+int test_mm512_mulhrs_epi16();
+int test_mm512_mul_round_pd();
+int test_mm512_mul_round_ps();
+int test_mm_sll_epi64();
+int test_mm256_sll_epi32();
+int test_mm256_sll_epi64();
+int test_mm512_sll_epi64();
+int test_mm256_slli_epi32();
+int test_mm256_slli_epi64();
+int test_mm512_slli_epi64();
+int test_mm256_srli_epi64();
+int test_mm512_srli_epi64();
+int test_mm256_slli_si256();
+int test_mm256_srli_si256();
+int test_mm256_and_si256();
+int test_mm512_and_si512();
+int test_mm256_or_si256();
+int test_mm512_or_si512();
+int test_mm256_andnot_si256();
+int test_mm512_andnot_si512();
+int test_mm256_xor_si256();
+int test_mm512_xor_si512();
+int test_mm256_or_ps();
+int test_mm256_or_pd();
+int test_mm512_and_epi32();
+int test_mm512_and_epi64();
+int test_mm512_or_epi32();
+int test_mm512_or_epi64();
+int test_mm512_xor_ps();
+int test_mm512_xor_pd();
+int test_mm256_cmpeq_epi8();
+int test_mm256_cmpeq_epi32();
+int test_mm_cmpeq_epi64();
+int test_mm512_set_epi32();
+int test_mm512_set_epi64();
+int test_mm512_set1_epi32();
+int test_mm512_set1_epi64();
+int test_mm512_set1_epi8();
+int test_mm512_set_ps();
+int test_mm512_set_pd();
+int test_mm512_set1_ps();
+int test_mm512_set1_pd();
+int test_mm512_setzero_ps();
+int test_mm512_setzero_pd();
+int test_mm_move_sd();
+int test_mm_move_ss();
+int test_mm256_movemask_epi8();
+int test_mm256_movemask_ps();
+int test_mm_testz_si128();
+int test_mm256_testz_si256();
+int test_mm512_movm_epi8();
+int test_mm_extract_epi32();
+int test_mm_extract_epi64();
+int test_mm256_extracti128_si256();
+int test_mm_extract_ps();
+int test_mm256_extract_epi32();
+int test_mm256_extract_epi64();
+int test_mm256_extractf128_ps();
+int test_mm256_extractf128_pd();
+int test_mm512_extractf32x8_ps();
+int test_mm512_extractf64x4_pd();
+int test_mm_crc32_u8();
+int test_mm_crc32_u16();
+int test_mm_crc32_u32();
+int test_mm_crc32_u64();
+int test_mm256_shuffle_epi8();
+int test_mm512_shuffle_epi8();
+int test_mm512_maskz_shuffle_epi8();
+int test_mm256_multishift_epi64_epi8();
+int test_mm512_multishift_epi64_epi8();
+int test_mm512_cmp_epi32_mask();
+int test_mm512_cmp_epi8_mask();
+int test_mm512_cmpeq_epi8_mask();
+int test_mm512_mask_cmpeq_epi8_mask();
+int test_mm256_unpacklo_epi8();
+int test_mm256_unpackhi_epi8();
+int test_mm512_unpacklo_epi8();
+int test_mm512_unpackhi_epi8();
+int test_mm256_store_si256();
+int test_mm256_storeu_si256();
+int test_mm512_store_si512();
+int test_mm256_inserti128_si256();
+int test_mm256_insertf128_pd();
+int test_mm256_insertf128_ps();
+int test_mm256_permute4x64_epi64();
+int test_mm_set_pd(void);
+int test_mm256_set_epi32(void);
+int test_mm256_set_epi64x(void);
+int test_mm256_set_m128i(void);
+int test_mm256_set_ps(void);
+int test_mm256_set_pd(void);
+int test_mm256_setzero_si256(void);
+int test_mm256_setzero_ps(void);
+int test_mm256_setzero_pd(void);
+int test_mm_set1_epi64x(void);
+int test_mm_set1_pd(void);
+int test_mm256_set1_epi8(void);
+int test_mm256_set1_epi32(void);
+int test_mm256_set1_epi64x(void);
+int test_mm256_set1_pd(void);
+int test_mm256_set1_ps(void);
+int test_mm256_alignr_epi8(void);
+int test_mm256_load_si256(void);
+int test_mm256_loadu_si256(void);
+int test_mm256_maskload_epi32(void);
+int test_mm512_load_si512(void);
+int test_mm512_loadu_si512(void);
+int test_mm512_mask_loadu_epi8(void);
+int test_mm512_maskz_loadu_epi8(void);
+int test_mm512_abs_epi8(void);
+int test_mm256_broadcastq_epi64(void);
+int test_mm256_broadcastsi128_si256(void);
+int test_mm512_broadcast_i32x4(void);
+int test_mm512_broadcast_i64x4(void);
+int test_mm512_mask_broadcast_i64x4(void);
+int test_mm256_castpd128_pd256(void);
+int test_mm256_castpd256_pd128(void);
+int test_mm256_castps128_ps256(void);
+int test_mm256_castps256_ps128(void);
+int test_mm256_castsi128_si256(void);
+int test_mm256_castsi256_ps(void);
+int test_mm256_castsi256_si128(void);
+int test_mm256_cvtepi32_pd(void);
+int test_mm256_cvtepi32_ps(void);
+int test_mm_cmpestri();
+int test_mm_cmpestrm();
+int test_mm512_div_epi8();
+int test_mm512_div_epi16();
+int test_mm512_div_epi32();
+int test_mm512_div_epi64();
+int test_mm512_div_epu8();
+int test_mm512_div_epu16();
+int test_mm512_div_epu32();
+int test_mm512_div_epu64();
+int test_mm512_div_round_ps();
+int test_mm512_div_round_pd();
+int test_mm_insert_epi32();
+int test_mm256_insert_epi32();
+int test_mm256_insert_epi64();
+int test_mm512_castpd128_pd512();
+int test_mm512_castpd512_pd128();
+int test_mm512_castps128_ps512();
+int test_mm512_castps512_ps128();
+int test_mm512_cvtepi32_pd();
+int test_mm512_cvtepi32_ps();
+int test_mm512_insertf32x8();
+int test_mm512_insertf64x4();
+int test_mm512_inserti32x8();
+int test_mm512_inserti64x4();
+int test_mm512_permutexvar_epi32();
+int test_mm256_cmp_pd();
+int test_mm256_cmp_ps();
+int test_mm512_cmp_pd_mask();
+int test_mm512_cmp_ps_mask();
+
+#endif
\ No newline at end of file
diff --git a/tests/main.c b/tests/main.c
new file mode 100644
index 0000000..8fb23af
--- /dev/null
+++ b/tests/main.c
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) Huawei Technologies Co., Ltd. 2012-2018. All rights reserved.
+ * Description: avx2neon unit test main
+ * Author: xuqimeng
+ * Create: 2019-11-05
+*/
+
+#include <stdio.h>
+#include "a2ntest.h"
+
+int main()
+{
+    unsigned int i;
+    int passCount = 0;
+    int failCount = 0;
+    for (i = UT_MM_EXTRACT_EPI32; i <= UT_MM256_SET_EPI32; i++) {
+        int flag = 0;
+        const char *s = RunTest((InstructionTest)i, &flag);
+        printf("Running Test %s\n", s);
+        if (flag) {
+            passCount++;
+        } else {
+            printf("**FAILURE** AVX2NEONTest %s\n", s);
+            failCount++;
+        }
+    }
+    printf("AVX2NEONTest Complete: Passed %d tests : Failed %d\n", passCount, failCount);
+
+    return 0;
+}
diff --git a/typedefs.h b/typedefs.h
new file mode 100644
index 0000000..d00e46d
--- /dev/null
+++ b/typedefs.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2019. Huawei Technologies Co., Ltd. All rights reserved.
+
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+
+ * http://www.apache.org/licenses/LICENSE-2.0
+
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+
+ */
+
+#ifndef AVX2NEON_H
+#error "Never use <typedefs.h> directly; include "avx2neon.h" instead."
+#endif
+
+#ifndef TYPEDEFS_H
+#define TYPEDEFS_H
+
+#if defined(__GNUC__) || defined(__clang__)
+
+#pragma push_macro("FORCE_INLINE")
+#pragma push_macro("ALIGN_STRUCT")
+#define FORCE_INLINE static inline __attribute__((always_inline))
+#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
+
+#else
+
+#warning "Macro name collisions may happen with unknown compiler"
+#ifdef FORCE_INLINE
+#undef FORCE_INLINE
+#endif
+
+#define FORCE_INLINE static inline
+#ifndef ALIGN_STRUCT
+#define ALIGN_STRUCT(x) __declspec(align(x))
+#endif
+
+#endif
+
+#define likely(x) __builtin_expect(!!(x), 1)
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+#ifndef __int32
+#define __int32 int
+#endif
+
+#ifndef __int64
+#define __int64 long long
+#endif
+
+#ifndef __mmask64
+#define __mmask64 unsigned long long
+#endif
+
+#ifndef __mmask16
+#define __mmask16 unsigned short
+#endif
+
+#ifndef __mmask8
+#define __mmask8 unsigned char
+#endif
+
+#endif //TYPEDEFS_H
\ No newline at end of file
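
Reviewer note: the test bodies themselves are outside this patch, but the several hundred test_* declarations in tests/a2ntest.h all share one shape. A minimal sketch of that pattern, assuming each function exercises one emulated intrinsic against a precomputed reference and returns nonzero on success; the input value and expected result below are illustrative, not taken from the suite:

#include "avx2neon.h" /* assumed to provide the emulated _mm_popcnt_u32 */

int test_mm_popcnt_u32()
{
    unsigned int input = 0xF0F0F0F0u;         /* 16 bits set */
    int expected = 16;                        /* scalar reference result */
    return _mm_popcnt_u32(input) == expected; /* nonzero == pass */
}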
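RunTest is declared in tests/a2ntest.h and consumed by the driver loop in tests/main.c, but its definition is not part of this hunk. A sketch of the dispatch it implies; the case labels are real enumerators from the header, while the returned name strings are assumptions:

const char *RunTest(InstructionTest test, int *flag)
{
    switch (test) {
    case UT_MM_POPCNT_U32:
        *flag = test_mm_popcnt_u32(); /* nonzero flag == pass */
        return "MM_POPCNT_U32";
    case UT_MM_CRC32_U8:
        *flag = test_mm_crc32_u8();
        return "MM_CRC32_U8";
    /* ... one case per enumerator up to UT_MM256_SET_EPI32 ... */
    default:
        *flag = 0;                    /* unknown value counts as failure */
        return "UNKNOWN";
    }
}

One design consequence worth flagging: the driver iterates every integer from UT_MM_EXTRACT_EPI32 through UT_MM256_SET_EPI32 inclusive, so UT_MM256_SET_EPI32 must remain the last enumerator; any test added after it in the enum would silently never run.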
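IsEqualFloat32x4 and IsEqualFloat64x2 are the suite's epsilon comparators for floating-point results. One plausible body for the first, assuming __m128 is a typedef of NEON's float32x4_t; the typedef below is a stand-in for the library's own definition, not the actual one:

#include <arm_neon.h>
#include <math.h>

typedef float32x4_t __m128; /* stand-in; the real definition lives in avx2neon */

int IsEqualFloat32x4(__m128 a, const float32_t *x, float epsilon)
{
    float32_t lanes[4];
    vst1q_f32(lanes, a);                      /* spill the vector to memory */
    for (int i = 0; i < 4; ++i) {
        if (fabsf(lanes[i] - x[i]) > epsilon) {
            return 0;                         /* lane i outside tolerance */
        }
    }
    return 1;                                 /* all lanes within epsilon */
}

IsEqualFloat64x2 would presumably do the same over two float64_t lanes via vst1q_f64.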
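typedefs.h centralizes the portability glue: FORCE_INLINE and ALIGN_STRUCT hide the GCC/Clang attributes behind portable names, and the __mmask* typedefs supply the AVX-512 mask types that x86 compilers provide for free. A hypothetical example of how emulation code could use them; demo_m256i and demo_mm256_set1_epi64x are made-up names for illustration, not the library's types:

#include <arm_neon.h>

#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))

/* A 256-bit "register" emulated as two NEON quadwords, aligned as
 * AVX-aware callers expect. */
typedef struct ALIGN_STRUCT(32) {
    int64x2_t vect[2];
} demo_m256i;

FORCE_INLINE demo_m256i demo_mm256_set1_epi64x(long long a)
{
    demo_m256i r;
    r.vect[0] = vdupq_n_s64(a); /* broadcast into the low 128 bits */
    r.vect[1] = vdupq_n_s64(a); /* and into the high 128 bits */
    return r;
}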