Blend混合SIMD小试
之前的版本不太对,再更新一下。
#include <cstring>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <stdint.h>
#include <immintrin.h>
#include <smmintrin.h>
#include "thirdparty/libass/include/ass.h"
#include <chrono>
// Portable spellings for the pipe API: the Windows CRT names are _popen/_pclose.
// kOpenOption is the matching mode string ("wb" on Windows, "w" elsewhere).
// NOTE(review): nothing in the visible part of this file uses POPEN, PCLOSE or
// kOpenOption - confirm they are still needed.
#if defined(_WIN32) || defined(_WIN64)
#define POPEN _popen
#define PCLOSE _pclose
const char *kOpenOption = "wb";
#else
#define POPEN popen
#define PCLOSE pclose
const char *kOpenOption = "w";
#endif
// Manual C-linkage declaration of a libass function.
// NOTE(review): presumably declared by hand because the included ass.h does not
// expose it - confirm; it is not called in the visible code.
extern "C" {
int ass_process_events_line(ASS_Track *track, char *str);
}
// Process-wide libass handles shared by the functions in this file.
ASS_Library *ass_library;
ASS_Renderer *ass_renderer;
// Destination frame the subtitle bitmaps are blended into.
typedef struct image_s {
int width, height, stride; // stride is the distance between rows in bytes
unsigned char *buffer; // 4 bytes per pixel (blenders write R, G, B, A), not 3-byte RGB24
} image_t;
// libass message callback: prints messages whose level is 6 or lower to stdout
// as "libass: <formatted message>\n" and silently drops the rest.
// `data` is the user pointer registered with the callback; it is unused.
void msg_callback(int level, const char *fmt, va_list va, void *data) {
    (void)data;
    if (level <= 6) {
        fputs("libass: ", stdout);
        vprintf(fmt, va);
        putchar('\n');
    }
}
// Channel extractors for a 32-bit colour laid out as 0xRRGGBBAA.
// The blenders in this file treat the A byte as transparency
// (opacity = 255 - TO_A(c)).
// TO_R does not mask, so it only yields 0..255 for an unsigned 32-bit argument.
#define TO_R(c) ((c) >> 24)
#define TO_G(c) (((c) >> 16) & 0xFF)
#define TO_B(c) (((c) >> 8) & 0xFF)
#define TO_A(c) ((c)&0xFF)
// Reference scalar blender: composites one libass bitmap onto `frame` using an
// exact integer division by 255*255.
//
// frame: destination, 4 bytes per pixel, rows `frame->stride` bytes apart.
// img:   one ASS_Image; `bitmap` holds one coverage byte per pixel, `color` is
//        decoded with TO_R/TO_G/TO_B/TO_A.
// The alpha byte of the destination is blended towards 255.
// No clipping is done here; the image rectangle is assumed to lie inside the
// frame (TODO confirm the renderer guarantees this).
inline void blend_single(image_t *frame, ASS_Image *img) {
    const uint32_t full_scale = 255 * 255;
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned char tint[4] = {(unsigned char)TO_R(img->color),
                                   (unsigned char)TO_G(img->color),
                                   (unsigned char)TO_B(img->color), 255};
    const unsigned char *coverage = img->bitmap;
    unsigned char *out_row = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        for (int col = 0; col < img->w; ++col) {
            // Weight of the overlay colour, scaled by 255*255.
            const uint32_t k = (uint32_t)coverage[col] * opacity;
            const uint32_t keep = full_scale - k;
            unsigned char *px = out_row + col * 4;
            for (int ch = 0; ch < 4; ++ch) {
                px[ch] = (k * tint[ch] + keep * px[ch]) / full_scale;
            }
        }
        coverage += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the linked list starting at `img` onto `frame`, in
// list order, using blend_single(). A NULL list is a no-op.
inline void blend(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single(frame, cur);
    }
}
// Division by 255 without a divide instruction.
// Returns x / 255 for the range the blenders produce (0 <= x <= 255 * 255).
inline int div_255_fast(int x) {
    const int correction = (x + 257) >> 8;
    return (x + correction) >> 8;
}
//#define div_255_fast(A) ((A) / 255)
// Scalar blender that replaces both divisions with div_255_fast():
// coverage*opacity is first normalised to 0..255, then each channel is mixed
// with weights k and 255-k and normalised again.
//
// frame: destination, 4 bytes per pixel.
// img:   one ASS_Image (coverage bitmap plus colour).
inline void blend_single_normal_fast_div_single(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned char tint[4] = {(unsigned char)TO_R(img->color),
                                   (unsigned char)TO_G(img->color),
                                   (unsigned char)TO_B(img->color), 255};
    const unsigned char *coverage = img->bitmap;
    unsigned char *out_row = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        for (int col = 0; col < img->w; ++col) {
            const unsigned k = div_255_fast(((unsigned)coverage[col]) * opacity);
            const unsigned keep = 255 - k;
            unsigned char *px = out_row + col * 4;
            for (int ch = 0; ch < 4; ++ch) {
                px[ch] = div_255_fast(k * tint[ch] + keep * px[ch]);
            }
        }
        coverage += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_normal_fast_div_single(). A NULL list is a no-op.
inline void blend_normal_fast_div(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_normal_fast_div_single(frame, cur);
    }
}
#if 1
// Vector counterpart of div_255_fast(): divides each of the eight unsigned
// 16-bit lanes of x by 255 using two saturating adds and two shifts.
// Intended for lane values in 0..255*255, which is what the blenders feed it.
static inline __m128i _mm_fast_div_255_epu16(__m128i x) {
    const __m128i bias = _mm_set1_epi16(0x0101);
    const __m128i correction = _mm_srli_epi16(_mm_adds_epu16(x, bias), 8);
    const __m128i adjusted = _mm_adds_epu16(x, correction);
    return _mm_srli_epi16(adjusted, 8);
}
// SSE blender, variant 7: two pixels per iteration, everything (including the
// coverage*opacity normalisation) done in unsigned 16-bit lanes.
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
//
// Notes on correctness:
//  * The colour lanes must be R, G, B, 255 in memory order, exactly like the
//    scalar blend_single(). The word is therefore assembled from the TO_*
//    macros; byte-swapping `color` and OR-ing 0xFF would set the red lane to
//    255 and leave the transparency byte in the alpha lane.
//  * Results are narrowed with _mm_packus_epi16 (unsigned saturation). The
//    signed pack would clamp every channel above 127.
void blend_single_u16_simd_method7(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const uint32_t rgba = (uint32_t)TO_R(img->color) | ((uint32_t)TO_G(img->color) << 8) |
                          ((uint32_t)TO_B(img->color) << 16) | 0xFF000000u;
    const int img_w_tmp = img->w;

    // Loop-invariant vectors.
    const __m128i zeros = _mm_setzero_si128();
    const __m128i rgb_v = _mm_unpacklo_epi8(_mm_set1_epi32((int)rgba), zeros);
    const __m128i opacity_v = _mm_set1_epi16(opacity);
    const __m128i max_v = _mm_set1_epi16(255);

    unsigned char *src = img->bitmap;
    unsigned char *dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int y = 0; y < img->h; ++y) {
        unsigned char *now_dst = dst;
        int x = 0;

        // Main loop: lanes 0-3 belong to pixel x, lanes 4-7 to pixel x + 1.
        for (; x <= img_w_tmp - 2; x += 2, now_dst += 8) {
            __m128i k_v = _mm_unpackhi_epi64(_mm_set1_epi16(src[x]), _mm_set1_epi16(src[x + 1]));
            k_v = _mm_fast_div_255_epu16(_mm_mullo_epi16(k_v, opacity_v));

            const __m128i fg = _mm_mullo_epi16(k_v, rgb_v);
            __m128i bg = _mm_loadl_epi64((const __m128i *)now_dst);
            bg = _mm_unpacklo_epi8(bg, zeros);
            bg = _mm_mullo_epi16(_mm_sub_epi16(max_v, k_v), bg);

            const __m128i mixed = _mm_fast_div_255_epu16(_mm_add_epi16(fg, bg));
            _mm_storeu_si64(now_dst, _mm_packus_epi16(mixed, zeros));
        }

        // Odd width: blend the last pixel on its own (4-byte load/store).
        if (x < img_w_tmp) {
            __m128i k_v = _mm_mullo_epi16(_mm_set1_epi16(src[x]), opacity_v);
            k_v = _mm_fast_div_255_epu16(k_v);

            const __m128i fg = _mm_mullo_epi16(k_v, rgb_v);
            __m128i bg = _mm_loadu_si32(now_dst);
            bg = _mm_unpacklo_epi8(bg, zeros);
            bg = _mm_mullo_epi16(_mm_sub_epi16(max_v, k_v), bg);

            const __m128i mixed = _mm_fast_div_255_epu16(_mm_add_epi16(fg, bg));
            _mm_storeu_si32(now_dst, _mm_packus_epi16(mixed, zeros));
        }

        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_u16_simd_method7(). A NULL list is a no-op.
void blend_u16_simd7(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_u16_simd_method7(frame, cur);
    }
}
// SSE blender, variant 6: two pixels per iteration; the per-pixel weight k is
// computed in scalar code and broadcast into the vector.
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
//
// Notes on correctness:
//  * k must be normalised to 0..255 (div_255_fast) before it enters the
//    16-bit lanes. With the raw coverage*opacity product (up to 65025) both
//    k * colour and 255 - k wrap around.
//  * The colour lanes are R, G, B, 255 in memory order, matching the scalar
//    blend_single(). The word is built from the TO_* macros rather than a
//    byte swap so red and alpha cannot be mixed up.
void blend_single_u16_simd_method6(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const uint32_t rgba = (uint32_t)TO_R(img->color) | ((uint32_t)TO_G(img->color) << 8) |
                          ((uint32_t)TO_B(img->color) << 16) | 0xFF000000u;
    const int img_w_tmp = img->w;

    // Loop-invariant vectors.
    const __m128i zeros = _mm_setzero_si128();
    const __m128i rgb_v = _mm_unpacklo_epi8(_mm_set1_epi32((int)rgba), zeros);
    const __m128i max_v = _mm_set1_epi16(255);

    unsigned char *src = img->bitmap;
    unsigned char *dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int y = 0; y < img->h; ++y) {
        unsigned char *now_dst = dst;
        int x = 0;

        // Main loop: lanes 0-3 belong to pixel x, lanes 4-7 to pixel x + 1.
        for (; x <= img_w_tmp - 2; x += 2, now_dst += 8) {
            const uint16_t k = div_255_fast(((unsigned int)src[x]) * opacity);
            const uint16_t k1 = div_255_fast(((unsigned int)src[x + 1]) * opacity);
            const __m128i k_v = _mm_unpackhi_epi64(_mm_set1_epi16(k), _mm_set1_epi16(k1));

            const __m128i fg = _mm_mullo_epi16(k_v, rgb_v);
            __m128i bg = _mm_loadl_epi64((const __m128i *)now_dst);
            bg = _mm_unpacklo_epi8(bg, zeros);
            bg = _mm_mullo_epi16(_mm_sub_epi16(max_v, k_v), bg);

            const __m128i mixed = _mm_fast_div_255_epu16(_mm_add_epi16(fg, bg));
            _mm_storeu_si64(now_dst, _mm_packus_epi16(mixed, zeros));
        }

        // Odd width: blend the last pixel on its own (4-byte load/store).
        if (x < img_w_tmp) {
            const uint16_t k = div_255_fast(((unsigned int)src[x]) * opacity);
            const __m128i k_v = _mm_set1_epi16(k);

            const __m128i fg = _mm_mullo_epi16(k_v, rgb_v);
            __m128i bg = _mm_loadu_si32(now_dst);
            bg = _mm_unpacklo_epi8(bg, zeros);
            bg = _mm_mullo_epi16(_mm_sub_epi16(max_v, k_v), bg);

            const __m128i mixed = _mm_fast_div_255_epu16(_mm_add_epi16(fg, bg));
            _mm_storeu_si32(now_dst, _mm_packus_epi16(mixed, zeros));
        }

        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_u16_simd_method6(). A NULL list is a no-op.
void blend_u16_simd6(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_u16_simd_method6(frame, cur);
    }
}
// SSE blender, variant 5: two pixels per iteration; k is normalised in scalar
// code with div_255_fast() and the channel mix runs in 16-bit lanes.
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
//
// The colour lanes must be R, G, B, 255 in memory order, matching the scalar
// blend_single(). The word is assembled from the TO_* macros; byte-swapping
// `color` and OR-ing 0xFF would force red to 255 and put the transparency
// byte into the alpha lane.
void blend_single_u16_simd_method5(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const uint32_t rgba = (uint32_t)TO_R(img->color) | ((uint32_t)TO_G(img->color) << 8) |
                          ((uint32_t)TO_B(img->color) << 16) | 0xFF000000u;
    const int img_w_tmp = img->w;

    // Loop-invariant vectors.
    const __m128i zeros = _mm_setzero_si128();
    const __m128i rgb_v = _mm_unpacklo_epi8(_mm_set1_epi32((int)rgba), zeros);
    const __m128i sub_max = _mm_set1_epi16(255);

    unsigned char *src = img->bitmap;
    unsigned char *dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int y = 0; y < img->h; ++y) {
        unsigned char *now_dst = dst;
        int x = 0;

        // Main loop: lanes 0-3 belong to pixel x, lanes 4-7 to pixel x + 1.
        for (; x <= img_w_tmp - 2; x += 2, now_dst += 8) {
            const uint16_t k = div_255_fast(((unsigned int)src[x]) * opacity);
            const uint16_t k1 = div_255_fast(((unsigned int)src[x + 1]) * opacity);
            const __m128i k_v = _mm_unpackhi_epi64(_mm_set1_epi16(k), _mm_set1_epi16(k1));

            const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
            const __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
            __m128i dst_v = _mm_loadl_epi64((const __m128i *)now_dst);
            dst_v = _mm_unpacklo_epi8(dst_v, zeros);
            const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);

            const __m128i res = _mm_fast_div_255_epu16(_mm_add_epi16(mul_1, mul_2));
            _mm_storeu_si64(now_dst, _mm_packus_epi16(res, zeros));
        }

        // Odd width: blend the last pixel on its own (4-byte load/store).
        if (x < img_w_tmp) {
            const uint16_t k = div_255_fast(((unsigned int)src[x]) * opacity);
            const __m128i k_v = _mm_set1_epi16(k);

            const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
            const __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
            __m128i dst_v = _mm_loadu_si32(now_dst);
            dst_v = _mm_unpacklo_epi8(dst_v, zeros);
            const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);

            const __m128i res = _mm_fast_div_255_epu16(_mm_add_epi16(mul_1, mul_2));
            _mm_storeu_si32(now_dst, _mm_packus_epi16(res, zeros));
        }

        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_u16_simd_method5(). A NULL list is a no-op.
inline void blend_u16_simd5(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_u16_simd_method5(frame, cur);
    }
}
// SSE blender, variant 4: two pixels per iteration with 8-byte load/store;
// the inverse weight 255 - k is obtained with a vector subtract.
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
//
// The two-pixel path is only taken while at least two pixels remain in the
// row. Taking it unconditionally would, for odd widths, read src[w] and write
// four bytes past the end of the row (past the frame buffer on its last row).
inline void blend_single_u16_simd_method4(image_t *frame, ASS_Image *img) {
    int x, y;
    unsigned char opacity = 255 - TO_A(img->color);
    unsigned short r = TO_R(img->color);
    unsigned short g = TO_G(img->color);
    unsigned short b = TO_B(img->color);
    unsigned char *src;
    unsigned char *dst, *now_dst;
    __m128i zeros = _mm_setzero_si128();
    src = img->bitmap;
    dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
    for (y = 0; y < img->h; ++y) {
        for (x = 0, now_dst = dst; x < img->w;) {
            // Lanes (low to high): r, g, b, 255, r, g, b, 255.
            __m128i rgb_v = _mm_set_epi16(255, b, g, r, 255, b, g, r);
            if (img->w - x > 1) [[likely]] {
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                uint16_t k1 = ((unsigned int)src[x + 1]) * opacity / 255;
                __m128i low_k = _mm_set1_epi16(k);
                __m128i high_k1 = _mm_set1_epi16(k1);
                // Lanes 0-3 carry k (pixel x), lanes 4-7 carry k1 (pixel x + 1).
                __m128i k_v = _mm_unpackhi_epi64(low_k, high_k1);
                __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                __m128i sub_max = _mm_set1_epi16(255);
                __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
                __m128i dst_v = _mm_loadl_epi64((__m128i *)(now_dst));
                dst_v = _mm_unpacklo_epi8(dst_v, zeros);
                __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                __m128i res2 = _mm_fast_div_255_epu16(res1);
                res2 = _mm_packus_epi16(res2, zeros);
                _mm_storeu_si64(now_dst, res2);
                x += 2;
                now_dst += 8;
            } else {
                // Last pixel of an odd-width row.
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                __m128i k_v = _mm_set1_epi16(k);
                __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                __m128i k_v2 = _mm_set1_epi16(k);
                __m128i dst_v = _mm_set_epi16(0, 0, 0, 0, now_dst[3], now_dst[2],
                                              now_dst[1], now_dst[0]);
                __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                __m128i res2 = _mm_fast_div_255_epu16(res1);
                __m128i packed;
                _mm_store_si128(&packed, res2);
                // Low byte of each 16-bit lane is the blended channel.
                now_dst[0] = packed.m128i_u8[0];
                now_dst[1] = packed.m128i_u8[2];
                now_dst[2] = packed.m128i_u8[4];
                now_dst[3] = packed.m128i_u8[6];
                x++;
                now_dst += 4;
            }
        }
        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_u16_simd_method4(). A NULL list is a no-op.
inline void blend_u16_simd4(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_u16_simd_method4(frame, cur);
    }
}
// SSE blender, variant 3: two pixels per iteration with 8-byte load/store;
// both k and 255 - k are computed in scalar code and broadcast with
// _mm_set1_epi16 + _mm_unpackhi_epi64.
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
//
// The two-pixel path is only taken while at least two pixels remain in the
// row. Taking it unconditionally would, for odd widths, read src[w] and write
// four bytes past the end of the row (past the frame buffer on its last row).
inline void blend_single_u16_simd_method3(image_t *frame, ASS_Image *img) {
    int x, y;
    unsigned char opacity = 255 - TO_A(img->color);
    unsigned short r = TO_R(img->color);
    unsigned short g = TO_G(img->color);
    unsigned short b = TO_B(img->color);
    unsigned char *src;
    unsigned char *dst, *now_dst;
    __m128i zeros = _mm_setzero_si128();
    src = img->bitmap;
    dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
    for (y = 0; y < img->h; ++y) {
        for (x = 0, now_dst = dst; x < img->w;) {
            // Lanes (low to high): r, g, b, 255, r, g, b, 255.
            __m128i rgb_v = _mm_set_epi16(255, b, g, r, 255, b, g, r);
            if (img->w - x > 1) [[likely]] {
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                uint16_t k1 = ((unsigned int)src[x + 1]) * opacity / 255;
                __m128i low_k = _mm_set1_epi16(k);
                __m128i high_k1 = _mm_set1_epi16(k1);
                // Lanes 0-3 carry k (pixel x), lanes 4-7 carry k1 (pixel x + 1).
                __m128i k_v = _mm_unpackhi_epi64(low_k, high_k1);
                __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                k1 = 255 - k1;
                low_k = _mm_set1_epi16(k);
                high_k1 = _mm_set1_epi16(k1);
                __m128i k_v2 = _mm_unpackhi_epi64(low_k, high_k1);
                __m128i dst_v = _mm_loadl_epi64((__m128i *)(now_dst));
                dst_v = _mm_unpacklo_epi8(dst_v, zeros);
                __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                __m128i res2 = _mm_fast_div_255_epu16(res1);
                res2 = _mm_packus_epi16(res2, zeros);
                _mm_storeu_si64(now_dst, res2);
                x += 2;
                now_dst += 8;
            } else {
                // Last pixel of an odd-width row.
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                __m128i k_v = _mm_set1_epi16(k);
                __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                __m128i k_v2 = _mm_set1_epi16(k);
                __m128i dst_v = _mm_set_epi16(0, 0, 0, 0, now_dst[3], now_dst[2],
                                              now_dst[1], now_dst[0]);
                __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                __m128i res2 = _mm_fast_div_255_epu16(res1);
                __m128i packed;
                _mm_store_si128(&packed, res2);
                // Low byte of each 16-bit lane is the blended channel.
                now_dst[0] = packed.m128i_u8[0];
                now_dst[1] = packed.m128i_u8[2];
                now_dst[2] = packed.m128i_u8[4];
                now_dst[3] = packed.m128i_u8[6];
                x++;
                now_dst += 4;
            }
        }
        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_u16_simd_method3(). A NULL list is a no-op.
inline void blend_u16_simd3(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_u16_simd_method3(frame, cur);
    }
}
// SSE blender, variant 2: two pixels per iteration with 8-byte load/store;
// the weight vectors are assembled directly with _mm_set_epi16.
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
//
// The two-pixel path is only taken while at least two pixels remain in the
// row. Taking it unconditionally would, for odd widths, read src[w] and write
// four bytes past the end of the row (past the frame buffer on its last row).
inline void blend_single_u16_simd_method2(image_t *frame, ASS_Image *img) {
    int x, y;
    unsigned char opacity = 255 - TO_A(img->color);
    unsigned short r = TO_R(img->color);
    unsigned short g = TO_G(img->color);
    unsigned short b = TO_B(img->color);
    unsigned char *src;
    unsigned char *dst, *now_dst;
    __m128i zeros = _mm_setzero_si128();
    src = img->bitmap;
    dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
    for (y = 0; y < img->h; ++y) {
        for (x = 0, now_dst = dst; x < img->w;) {
            // Lanes (low to high): r, g, b, 255, r, g, b, 255.
            __m128i rgb_v = _mm_set_epi16(255, b, g, r, 255, b, g, r);
            if (img->w - x > 1) [[likely]] {
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                uint16_t k1 = ((unsigned int)src[x + 1]) * opacity / 255;
                // Lanes 0-3 carry k (pixel x), lanes 4-7 carry k1 (pixel x + 1).
                __m128i k_v = _mm_set_epi16(k1, k1, k1, k1, k, k, k, k);
                __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                k1 = 255 - k1;
                __m128i k_v2 = _mm_set_epi16(k1, k1, k1, k1, k, k, k, k);
                __m128i dst_v = _mm_loadl_epi64((__m128i *)(now_dst));
                dst_v = _mm_unpacklo_epi8(dst_v, zeros);
                __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                __m128i res2 = _mm_fast_div_255_epu16(res1);
                res2 = _mm_packus_epi16(res2, zeros);
                _mm_storeu_si64(now_dst, res2);
                x += 2;
                now_dst += 8;
            } else {
                // Last pixel of an odd-width row.
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                __m128i k_v = _mm_set1_epi16(k);
                __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                __m128i k_v2 = _mm_set1_epi16(k);
                __m128i dst_v = _mm_set_epi16(0, 0, 0, 0, now_dst[3], now_dst[2],
                                              now_dst[1], now_dst[0]);
                __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                __m128i res2 = _mm_fast_div_255_epu16(res1);
                __m128i packed;
                _mm_store_si128(&packed, res2);
                // Low byte of each 16-bit lane is the blended channel.
                now_dst[0] = packed.m128i_u8[0];
                now_dst[1] = packed.m128i_u8[2];
                now_dst[2] = packed.m128i_u8[4];
                now_dst[3] = packed.m128i_u8[6];
                x++;
                now_dst += 4;
            }
        }
        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_u16_simd_method2(). A NULL list is a no-op.
inline void blend_u16_simd2(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_u16_simd_method2(frame, cur);
    }
}
// First SSE blender: the channel mix runs in 16-bit lanes, but destination
// bytes are gathered with _mm_set_epi16 and written back one byte at a time.
// Two pixels are processed per step while at least two remain in the row; a
// trailing odd pixel is processed alone.
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
inline void blend_single_u16_simd(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned short r = TO_R(img->color);
    const unsigned short g = TO_G(img->color);
    const unsigned short b = TO_B(img->color);
    unsigned char *mask_row = img->bitmap;
    unsigned char *out_row = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        int col = 0;
        while (col < img->w) {
            // Lanes (low to high): r, g, b, 255, r, g, b, 255.
            const __m128i tint = _mm_set_epi16(255, b, g, r, 255, b, g, r);
            unsigned char *px = out_row + col * 4;
            __m128i weight, inv_weight, background;
            int pixels;

            if (img->w - col > 1) [[likely]] {
                uint16_t k = ((unsigned int)mask_row[col]) * opacity / 255;
                uint16_t k1 = ((unsigned int)mask_row[col + 1]) * opacity / 255;
                weight = _mm_set_epi16(k1, k1, k1, k1, k, k, k, k);
                k = 255 - k;
                k1 = 255 - k1;
                inv_weight = _mm_set_epi16(k1, k1, k1, k1, k, k, k, k);
                background = _mm_set_epi16(px[7], px[6], px[5], px[4],
                                           px[3], px[2], px[1], px[0]);
                pixels = 2;
            } else {
                uint16_t k = ((unsigned int)mask_row[col]) * opacity / 255;
                weight = _mm_set1_epi16(k);
                k = 255 - k;
                inv_weight = _mm_set1_epi16(k);
                background = _mm_set_epi16(0, 0, 0, 0, px[3], px[2], px[1], px[0]);
                pixels = 1;
            }

            const __m128i sum = _mm_add_epi16(_mm_mullo_epi16(weight, tint),
                                              _mm_mullo_epi16(inv_weight, background));
            __m128i packed;
            _mm_store_si128(&packed, _mm_fast_div_255_epu16(sum));

            // The low byte of each 16-bit lane is the blended channel value.
            for (int i = 0; i < 4 * pixels; ++i) {
                px[i] = packed.m128i_u8[2 * i];
            }
            col += pixels;
        }
        mask_row += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_u16_simd(). A NULL list is a no-op.
inline void blend_u16_simd(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_u16_simd(frame, cur);
    }
}
#endif
// Scalar blender that avoids division altogether by shifting right by 16.
// This divides by 65536 instead of 255*255 (65025), so the output is an
// approximation of blend_single(): a fully covered, fully opaque pixel ends
// up slightly below the overlay colour.
//
// frame: destination, 4 bytes per pixel.
// img:   one ASS_Image (coverage bitmap plus colour).
inline void blend_single_normal_no_div(image_t *frame, ASS_Image *img) {
    const uint32_t full_scale = 255 * 255;
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned char tint[4] = {(unsigned char)TO_R(img->color),
                                   (unsigned char)TO_G(img->color),
                                   (unsigned char)TO_B(img->color), 255};
    const unsigned char *coverage = img->bitmap;
    unsigned char *out_row = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        for (int col = 0; col < img->w; ++col) {
            const uint32_t k = (uint32_t)coverage[col] * opacity;
            const uint32_t keep = full_scale - k;
            unsigned char *px = out_row + col * 4;
            for (int ch = 0; ch < 4; ++ch) {
                px[ch] = (k * tint[ch] + keep * px[ch]) >> 16;
            }
        }
        coverage += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_normal_no_div(). A NULL list is a no-op.
inline void blend_normal_no_div(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_normal_no_div(frame, cur);
    }
}
// One-pixel-per-iteration SSE blender working in 32-bit lanes: the four
// channels are multiplied by their weights in float, converted back to
// integers, summed and shifted right by 16 (a divide by 65536 rather than by
// 255*255, i.e. the same approximation as blend_single_normal_no_div()).
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
inline void blend_single_simd_no_div(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned char r = TO_R(img->color);
    const unsigned char g = TO_G(img->color);
    const unsigned char b = TO_B(img->color);
    unsigned char *mask_row = img->bitmap;
    unsigned char *out_row = (frame->buffer) + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        for (int col = 0; col < img->w; col++) {
            const uint32_t k = ((uint32_t)mask_row[col]) * opacity;
            const uint32_t keep = 65025 - k;
            uint8_t *px = (uint8_t *)&out_row[col << 2];

            // Background contribution: dst * (255*255 - k).
            const __m128 keep_ps = _mm_set1_ps((float)keep);
            const __m128 bg_ps = _mm_cvtepi32_ps(_mm_set_epi32(px[3], px[2], px[1], px[0]));
            const __m128i bg_term = _mm_cvtps_epi32(_mm_mul_ps(bg_ps, keep_ps));

            // Overlay contribution: colour * k.
            const __m128 k_ps = _mm_set1_ps((float)k);
            const __m128 fg_ps = _mm_cvtepi32_ps(_mm_set_epi32(255, b, g, r));
            const __m128i fg_term = _mm_cvtps_epi32(_mm_mul_ps(fg_ps, k_ps));

            const __m128i sum = _mm_add_epi32(bg_term, fg_term);
            __m128i packed;
            _mm_store_si128(&packed, _mm_srli_epi32(sum, 16));

            // The low byte of each 32-bit lane is the blended channel value.
            px[0] = packed.m128i_u8[0];
            px[1] = packed.m128i_u8[4];
            px[2] = packed.m128i_u8[8];
            px[3] = packed.m128i_u8[12];
        }
        mask_row += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_simd_no_div(). A NULL list is a no-op.
inline void blend_simd_no_div(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_simd_no_div(frame, cur);
    }
}
// One-pixel-per-iteration SSE blender working in 32-bit lanes: the weighted
// channel sums are built via float multiplies and then divided by 255*255
// with _mm_div_epi32.
// NOTE(review): _mm_div_epi32 is not a single hardware instruction; it is
// presumably supplied by the compiler's vector math library - confirm it is
// available on every toolchain this file is built with.
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
inline void blend_single_intel_div(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned char r = TO_R(img->color);
    const unsigned char g = TO_G(img->color);
    const unsigned char b = TO_B(img->color);
    unsigned char *mask_row = img->bitmap;
    unsigned char *out_row = (frame->buffer) + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        for (int col = 0; col < img->w; col++) {
            const uint32_t k = ((uint32_t)mask_row[col]) * opacity;
            const uint32_t keep = 65025 - k;
            uint8_t *px = (uint8_t *)&out_row[col << 2];

            // Background contribution: dst * (255*255 - k).
            const __m128 keep_ps = _mm_set1_ps((float)keep);
            const __m128 bg_ps = _mm_cvtepi32_ps(_mm_set_epi32(px[3], px[2], px[1], px[0]));
            const __m128i bg_term = _mm_cvtps_epi32(_mm_mul_ps(bg_ps, keep_ps));

            // Overlay contribution: colour * k.
            const __m128 k_ps = _mm_set1_ps((float)k);
            const __m128 fg_ps = _mm_cvtepi32_ps(_mm_set_epi32(255, b, g, r));
            const __m128i fg_term = _mm_cvtps_epi32(_mm_mul_ps(fg_ps, k_ps));

            const __m128i sum = _mm_add_epi32(bg_term, fg_term);
            const __m128i divisor = _mm_set1_epi32(65025);
            __m128i packed;
            _mm_store_si128(&packed, _mm_div_epi32(sum, divisor));

            // The low byte of each 32-bit lane is the blended channel value.
            px[0] = packed.m128i_u8[0];
            px[1] = packed.m128i_u8[4];
            px[2] = packed.m128i_u8[8];
            px[3] = packed.m128i_u8[12];
        }
        mask_row += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the list `img` onto `frame` with
// blend_single_intel_div(). A NULL list is a no-op.
inline void blend_simd_intel_div(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_intel_div(frame, cur);
    }
}
// One-pixel-per-iteration SSE blender working in 32-bit lanes. The weighted
// channel sums are built via float multiplies, then divided by 65025
// (255*255) with a multiply-high and shift sequence instead of a divide:
//   t7  = high 32 bits of (a * 0x02030406)
//   res = (t7 + ((a - t7) >> 1)) >> 15
//
// frame: destination, 4 bytes per pixel in R, G, B, A order.
// img:   one ASS_Image (coverage bitmap plus 0xRRGGBBAA colour).
inline void blend_single_fast_div(image_t *frame, ASS_Image *img) {
int x, y;
unsigned char opacity = 255 - TO_A(img->color);
unsigned char r = TO_R(img->color);
unsigned char g = TO_G(img->color);
unsigned char b = TO_B(img->color);
unsigned char *src;
unsigned char *dst;
src = img->bitmap;
dst = (frame->buffer) + img->dst_y * frame->stride + img->dst_x * 4;
for (y = 0; y < img->h; ++y) {
for (x = 0; x < img->w; x++) {
// k is the overlay weight scaled by 255*255, kk the background weight.
uint32_t k = ((uint32_t)src[x]) * opacity;
uint32_t kk = 65025 - k;
uint8_t *dst_p = (uint8_t *)&dst[x << 2];
uint32_t dst_u32; // NOTE(review): never used
// Background contribution: dst * kk, computed in float and converted back.
__m128 scale_dst = _mm_set1_ps((float)kk);
__m128i dst_32i = _mm_set_epi32(dst_p[3], dst_p[2], dst_p[1], dst_p[0]);
__m128 dst_ps = _mm_cvtepi32_ps(dst_32i);
__m128 scale_dst_res_ps = _mm_mul_ps(dst_ps, scale_dst);
__m128i scale_dst_res0_32i = _mm_cvtps_epi32(scale_dst_res_ps);
// Overlay contribution: (r, g, b, 255) * k; the variables above are reused.
scale_dst = _mm_set1_ps((float)k);
dst_32i = _mm_set_epi32(255, b, g, r);
dst_ps = _mm_cvtepi32_ps(dst_32i);
scale_dst_res_ps = _mm_mul_ps(dst_ps, scale_dst);
__m128i scale_dst_res1_32i = _mm_cvtps_epi32(scale_dst_res_ps);
// dst_32i now holds the four numerators to be divided by 65025.
dst_32i = _mm_add_epi32(scale_dst_res0_32i, scale_dst_res1_32i);
#if 1
// _mm_mul_epu32 only reads the low 32 bits of each 64-bit lane, so the
// effective multiplier is 0x02030406; s1 and s2 are the shift counts 1 and 15.
__m128i mult = _mm_set_epi64x(0x0203040602030406, 0x0203040602030406);
__m128i s1 = _mm_set_epi64x(0x0000000000000000, 0x0000000000000001);
__m128i s2 = _mm_set_epi64x(0x0000000000000000, 0x000000000000000f);
__m128i t1 = _mm_mul_epu32(
dst_32i, mult); // 32x32->64 bit unsigned multiplication of a[0] and a[2]
__m128i t2 = _mm_srli_epi64(t1, 32); // high dword of result 0 and 2
__m128i t3 = _mm_srli_epi64(
dst_32i, 32); // get a[1] and a[3] into position for multiplication
__m128i t4 = _mm_mul_epu32(
t3, mult); // 32x32->64 bit unsigned multiplication of a[1] and a[3]
#if 1 // SSE4.1 supported
__m128i t7 = _mm_blend_epi16(t2, t4, 0xCC); // blend two results
#else
__m128i t5 = _mm_set_epi32(-1, 0, -1, 0); // mask of dword 1 and 3
__m128i t6 = _mm_and_si128(t4, t5); // high dword of result 1 and 3
__m128i t7 = _mm_or_si128(t2, t6); // combine all four results into one vector
#endif
__m128i t8 = _mm_sub_epi32(dst_32i, t7); // subtract
__m128i t9 = _mm_srl_epi32(t8, s1); // shift right logical
__m128i t10 = _mm_add_epi32(t7, t9); // add
__m128i res = _mm_srl_epi32(t10, s2); // shift right logical
#endif
// The low byte of each 32-bit lane is the blended channel value.
__m128i packed;
_mm_store_si128(&packed, res);
dst_p[0] = packed.m128i_u8[0];
dst_p[1] = packed.m128i_u8[4];
dst_p[2] = packed.m128i_u8[8];
dst_p[3] = packed.m128i_u8[12];
}
src += img->stride;
dst += frame->stride;
}
}
// Blends every image of the list `img` onto `frame` with
// blend_single_fast_div(). A NULL list is a no-op.
inline void blend_simd_fast_div(image_t *frame, ASS_Image *img) {
    for (ASS_Image *cur = img; cur; cur = cur->next) {
        blend_single_fast_div(frame, cur);
    }
}
// Allocates a width x height frame with 4 bytes per pixel and fills every
// byte with 63.
//
// Returns a heap-allocated image_t; the caller owns it and must free both
// `buffer` and the struct itself.
// On invalid dimensions or allocation failure the process prints a message
// and exits with status 1, the same policy init() uses for libass failures.
inline image_t *gen_image(int width, int height) {
    if (width <= 0 || height <= 0) {
        printf("gen_image: invalid size %dx%d\n", width, height);
        exit(1);
    }

    image_t *img = (image_t *)malloc(sizeof(*img));
    if (!img) {
        printf("gen_image: out of memory\n");
        exit(1);
    }
    img->width = width;
    img->height = height;
    img->stride = width * 4;

    // Size computed in size_t so large frames do not overflow int arithmetic.
    const size_t bytes = (size_t)width * 4u * (size_t)height;
    // Plain malloc: the buffer is overwritten right away, zero-filling is wasted.
    img->buffer = (unsigned char *)malloc(bytes);
    if (!img->buffer) {
        printf("gen_image: out of memory\n");
        free(img);
        exit(1);
    }
    memset(img->buffer, 63, bytes);
    return img;
}
inline void init(int frame_w, int frame_h) {
ass_library = ass_library_init();
if (!ass_library) {
printf("ass_library_init failed!\n");
exit(1);
}
ass_set_message_cb(ass_library, msg_callback, NULL);
ass_set_extract_fonts(ass_library, 1);
ass_renderer = ass_renderer_init(ass_library);
if (!ass_renderer) {
printf("ass_renderer_init failed!\n");
exit(1);
}
ass_set_frame_size(ass_renderer, frame_w, frame_h);
ass_set_fonts(ass_renderer, NULL, "sans-serif", ASS_FONTPROVIDER_AUTODETECT, NULL, 1);
}
int main() {
init(1920, 1080);
ASS_Track *track = ass_read_file(ass_library, (char *)"D:/a1.ass", NULL);
double cost_time;
int tm = 0;
image_t *frame = gen_image(1920, 1080);
// ass_set_cache_limits(ass_renderer, 0, 50);
tm = 0;
cost_time = 0;
#define TIME_LIMIT (250 * 15 * 20)
while (tm < TIME_LIMIT) {
ass_render_frame(ass_renderer, track, tm, NULL);
tm += ((double)(1000) / (double)(60));
}
std::cout << "start!" << std::endl;
std::cout << "-1-7] simd uint16 method7 div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_u16_simd7(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "-1-6] simd uint16 method6 div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_u16_simd6(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "-1-5] simd uint16 method5 div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_u16_simd5(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "-1-4] simd uint16 method4 div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_u16_simd4(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "-1-3] simd uint16 method3 div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_u16_simd3(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "-1-2] simd uint16 method2 div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_u16_simd2(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "-1] simd uint16 div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_u16_simd(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "0] normal no div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_normal_no_div(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "1] normal method: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "3] fast div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_simd_fast_div(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "4] no div: ";
tm = 0;
cost_time = 0;
while (tm < TIME_LIMIT) {
ASS_Image *img = ass_render_frame(ass_renderer, track, tm, NULL);
// clear buffer
memset(frame->buffer, 0, 1920 * 1080 * 4);
auto job_start_time = std::chrono::high_resolution_clock::now();
blend_simd_no_div(frame, img);
auto job_end_time = std::chrono::high_resolution_clock::now();
cost_time +=
std::chrono::duration<double, std::milli>(job_end_time - job_start_time)
.count();
tm += ((double)(1000) / (double)(60));
}
std::cout << cost_time << " ms" << std::endl;
std::cout << "Done!\n";
return 0;
}
改进版本:
#include <cstring>
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <string>
#include <stdint.h>
#include <immintrin.h>
#include <smmintrin.h>
#include "thirdparty/libass/include/ass.h"
#include <chrono>
// Select the platform spelling of the pipe open/close functions and the mode
// string stored in kOpenOption.
#if defined(_WIN32) || defined(_WIN64)
// Windows builds use the underscore-prefixed names and a binary write mode.
#define POPEN _popen
#define PCLOSE _pclose
const char *kOpenOption = "wb";
#else
// All other platforms use the plain names and a plain write mode.
#define POPEN popen
#define PCLOSE pclose
const char *kOpenOption = "w";
#endif
// Forward declaration with C linkage.
// NOTE(review): presumably a libass function that the included ass.h does not
// declare -- confirm against the libass sources before relying on it.
extern "C" {
int ass_process_events_line(ASS_Track *track, char *str);
}
// File-scope libass handles. ass_renderer is the renderer handed to
// ass_render_frame() by the timing loops in this file.
// NOTE(review): both are presumably initialized by setup code elsewhere in the
// file -- confirm.
ASS_Library *ass_library;
ASS_Renderer *ass_renderer;
/* Destination frame that the blend routines in this file draw into. */
typedef struct image_s {
    int width;  /* presumably the frame width in pixels -- not read by the blend routines */
    int height; /* presumably the frame height in pixels -- not read by the blend routines */
    int stride; /* byte distance between the starts of consecutive rows */
    /* Pixel storage. The blend routines address it as 4 bytes per pixel and
     * write the channels in R, G, B, A order. */
    unsigned char *buffer;
} image_t;
/*
 * Logging callback: prints a message to stdout as "libass: <text>\n".
 *
 * level: messages with a level above 6 are dropped.
 * fmt/va: printf-style format string and its arguments.
 * data:  unused.
 */
void msg_callback(int level, const char *fmt, va_list va, void *data) {
    (void)data;
    if (level <= 6) {
        fputs("libass: ", stdout);
        vprintf(fmt, va);
        putchar('\n');
    }
}
/*
 * Channel extractors for a color packed as 0xRRGGBBAA.
 * Every extractor masks to 8 bits, so the result is a single channel byte even
 * when the argument is wider than 32 bits or is a sign-extended value.
 */
#define TO_R(c) (((c) >> 24) & 0xFF)
#define TO_G(c) (((c) >> 16) & 0xFF)
#define TO_B(c) (((c) >> 8) & 0xFF)
#define TO_A(c) ((c) & 0xFF)
// Scalar reference blend of one libass image into the frame.
//
// frame: destination, addressed as 4 bytes per pixel (R, G, B, A order) with
//        frame->stride bytes per row.
// img:   coverage bitmap (one byte per pixel, img->stride bytes per row), its
//        0xRRGGBBAA color, and the destination offset (dst_x, dst_y).
//
// For each pixel, k = bitmap * (255 - A) lies in [0, 255 * 255] and every
// channel becomes (k * color + (255 * 255 - k) * old) / (255 * 255); the
// "color" used for the alpha channel is 255. No clipping is performed here.
inline void blend_single(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    // Per-channel source values in the byte order of the destination.
    // Note: this order assumes a little-endian style R, G, B, A byte layout.
    const unsigned char tint[4] = {
        (unsigned char)TO_R(img->color), (unsigned char)TO_G(img->color),
        (unsigned char)TO_B(img->color), 255};

    const unsigned char *mask_row = img->bitmap;
    unsigned char *out_row =
        frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        for (int col = 0; col < img->w; ++col) {
            const uint32_t k = ((uint32_t)mask_row[col]) * opacity;
            unsigned char *px = out_row + col * 4;
            for (int c = 0; c < 4; ++c) {
                px[c] = (k * tint[c] + (255 * 255 - k) * px[c]) / (255 * 255);
            }
        }
        mask_row += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the linked list starting at img (linked through
// ->next) into frame with blend_single(). A null img is a no-op.
inline void blend(image_t *frame, ASS_Image *img) {
    for (; img; img = img->next) {
        blend_single(frame, img);
    }
}
// Division by 255 using only adds and shifts.
// Returns the same value as x / 255 for 0 <= x <= 255 * 255, which covers
// every product of two bytes that the blend routines pass in.
inline int div_255_fast(int x) {
    const int correction = (x + 257) >> 8;
    return (x + correction) >> 8;
}
//#define div_255_fast(A) ((A) / 255)
// Scalar blend of one libass image into the frame, using div_255_fast()
// instead of real divisions.
//
// frame: destination, addressed as 4 bytes per pixel (R, G, B, A order) with
//        frame->stride bytes per row.
// img:   coverage bitmap (one byte per pixel, img->stride bytes per row), its
//        0xRRGGBBAA color, and the destination offset (dst_x, dst_y).
//
// For each pixel, k = div255(bitmap * (255 - A)) and every channel becomes
// div255(k * color + (255 - k) * old); the "color" used for the alpha channel
// is 255. No clipping is performed here.
inline void blend_single_normal_fast_div_single(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    // Per-channel source values in the byte order of the destination.
    const unsigned char tint[4] = {
        (unsigned char)TO_R(img->color), (unsigned char)TO_G(img->color),
        (unsigned char)TO_B(img->color), 255};

    const unsigned char *mask_row = img->bitmap;
    unsigned char *out_row =
        frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        for (int col = 0; col < img->w; ++col) {
            const unsigned k = div_255_fast(((unsigned)mask_row[col]) * opacity);
            unsigned char *px = out_row + col * 4;
            for (int c = 0; c < 4; ++c) {
                px[c] = div_255_fast(k * tint[c] + (255 - k) * px[c]);
            }
        }
        mask_row += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the linked list starting at img (linked through
// ->next) into frame with blend_single_normal_fast_div_single().
// A null img is a no-op.
inline void blend_normal_fast_div(image_t *frame, ASS_Image *img) {
    for (; img; img = img->next) {
        blend_single_normal_fast_div_single(frame, img);
    }
}
#if 1
// Divides each of the eight unsigned 16-bit lanes of x by 255 using the same
// add-and-shift formula as div_255_fast(): (x + ((x + 257) >> 8)) >> 8.
// The additions saturate, so the result matches x / 255 only while
// x + 257 stays below 65536; lanes up to 255 * 255 satisfy that.
static inline __m128i _mm_fast_div_255_epu16(__m128i x) {
    const __m128i bias = _mm_set1_epi16(0x0101);
    const __m128i correction = _mm_srli_epi16(_mm_adds_epu16(x, bias), 8);
    const __m128i adjusted = _mm_adds_epu16(x, correction);
    return _mm_srli_epi16(adjusted, 8);
}
// Blends one libass image into the frame, two pixels per SSE iteration, with
// a one-pixel tail for odd widths.
//
// frame: destination, addressed as 4 bytes per pixel (R, G, B, A order) with
//        frame->stride bytes per row.
// img:   coverage bitmap (one byte per pixel, img->stride bytes per row), its
//        0xRRGGBBAA color, and the destination offset (dst_x, dst_y).
//
// Per channel: out = div255(k * color + (255 - k) * old) with
// k = div255(bitmap * (255 - A)); the "color" used for the alpha channel is
// 255, matching the scalar blend. No clipping is performed here.
inline void blend_single_u16_simd_method6(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    // Pack the color lowest-byte-first as R, G, B, 0xFF. The 0xFF has to land
    // in the top byte (the alpha slot); OR-ing it into the low byte of the
    // byte-swapped color would overwrite red and leave the image alpha in the
    // alpha slot.
    const uint32_t rgba = (uint32_t)TO_R(img->color) |
                          ((uint32_t)TO_G(img->color) << 8) |
                          ((uint32_t)TO_B(img->color) << 16) | 0xFF000000u;
    const int img_w_tmp = img->w;
    const __m128i zeros = _mm_setzero_si128();
    const __m128i sub_max = _mm_set1_epi16(255);
    // Loop-invariant: (R, G, B, 255) widened to 16-bit lanes, once per pixel.
    const __m128i rgb_v = _mm_unpacklo_epi8(_mm_set1_epi32((int)rgba), zeros);

    unsigned char *src = img->bitmap;
    unsigned char *dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
    for (int y = 0; y < img->h; ++y) {
        unsigned char *now_dst = dst;
        int x = 0;
        for (; x <= img_w_tmp - 2; x += 2, now_dst += 8) {
            const uint16_t k = div_255_fast(((unsigned int)src[x]) * opacity);
            const uint16_t k1 = div_255_fast(((unsigned int)src[x + 1]) * opacity);
            // Low four lanes carry k (first pixel), high four carry k1.
            const __m128i k_v =
                _mm_unpackhi_epi64(_mm_set1_epi16(k), _mm_set1_epi16(k1));
            const __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
            const __m128i dst_v =
                _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(now_dst)), zeros);
            // Each lane is at most 255 * 255, so the 16-bit sum cannot wrap.
            __m128i res1 = _mm_add_epi16(_mm_mullo_epi16(k_v, rgb_v),
                                         _mm_mullo_epi16(k_v2, dst_v));
            res1 = _mm_fast_div_255_epu16(res1);
            res1 = _mm_packus_epi16(res1, zeros);
            _mm_storeu_si64(now_dst, res1);
        }
        if (x < img_w_tmp) {
            // Odd width: blend the last pixel alone (4 bytes in, 4 bytes out).
            const uint16_t k = div_255_fast(((unsigned int)src[x]) * opacity);
            const __m128i k_v = _mm_set1_epi16(k);
            const __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
            const __m128i dst_v =
                _mm_unpacklo_epi8(_mm_loadu_si32(now_dst), zeros);
            __m128i res1 = _mm_add_epi16(_mm_mullo_epi16(k_v, rgb_v),
                                         _mm_mullo_epi16(k_v2, dst_v));
            res1 = _mm_fast_div_255_epu16(res1);
            res1 = _mm_packus_epi16(res1, zeros);
            _mm_storeu_si32(now_dst, res1);
        }
        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the linked list starting at img (linked through
// ->next) into frame with blend_single_u16_simd_method6().
// A null img is a no-op.
inline void blend_u16_simd6(image_t *frame, ASS_Image *img) {
    for (; img; img = img->next) {
        blend_single_u16_simd_method6(frame, img);
    }
}
// Blends one libass image into the frame, two pixels per SSE iteration, with
// a one-pixel tail for odd widths.
//
// frame: destination, addressed as 4 bytes per pixel (R, G, B, A order) with
//        frame->stride bytes per row.
// img:   coverage bitmap (one byte per pixel, img->stride bytes per row), its
//        0xRRGGBBAA color, and the destination offset (dst_x, dst_y).
//
// Per channel: out = div255(k * color + (255 - k) * old) with
// k = div255(bitmap * (255 - A)); the "color" used for the alpha channel is
// 255, matching the scalar blend. No clipping is performed here.
inline void blend_single_u16_simd_method5(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    // Pack the color lowest-byte-first as R, G, B, 0xFF. Setting the top byte
    // keeps red intact; OR-ing 0xFF into the low byte of the byte-swapped
    // color would force red to 255 and leave the image alpha in the alpha slot.
    const uint32_t rgba = (uint32_t)TO_R(img->color) |
                          ((uint32_t)TO_G(img->color) << 8) |
                          ((uint32_t)TO_B(img->color) << 16) | 0xFF000000u;
    const int img_w_tmp = img->w;
    const __m128i zeros = _mm_setzero_si128();
    const __m128i sub_max = _mm_set1_epi16(255);
    // Loop-invariant: (R, G, B, 255) widened to 16-bit lanes, once per pixel.
    const __m128i rgb_v = _mm_unpacklo_epi8(_mm_set1_epi32((int)rgba), zeros);

    unsigned char *src = img->bitmap;
    unsigned char *dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
    for (int y = 0; y < img->h; ++y) {
        unsigned char *now_dst = dst;
        int x = 0;
        for (; x <= img_w_tmp - 2; x += 2, now_dst += 8) {
            const uint16_t k = div_255_fast(((unsigned int)src[x]) * opacity);
            const uint16_t k1 = div_255_fast(((unsigned int)src[x + 1]) * opacity);
            // Low four lanes carry k (first pixel), high four carry k1.
            const __m128i k_v =
                _mm_unpackhi_epi64(_mm_set1_epi16(k), _mm_set1_epi16(k1));
            const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
            const __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
            const __m128i dst_v =
                _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(now_dst)), zeros);
            const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
            // Each lane is at most 255 * 255, so the 16-bit sum cannot wrap.
            const __m128i res1 = _mm_add_epi16(mul_1, mul_2);
            const __m128i res2 =
                _mm_packus_epi16(_mm_fast_div_255_epu16(res1), zeros);
            _mm_storeu_si64(now_dst, res2);
        }
        if (x < img_w_tmp) {
            // Odd width: blend the last pixel alone (4 bytes in, 4 bytes out).
            const uint16_t k = div_255_fast(((unsigned int)src[x]) * opacity);
            const __m128i k_v = _mm_set1_epi16(k);
            const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
            const __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
            const __m128i dst_v =
                _mm_unpacklo_epi8(_mm_loadu_si32(now_dst), zeros);
            const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
            const __m128i res1 = _mm_add_epi16(mul_1, mul_2);
            const __m128i res2 =
                _mm_packus_epi16(_mm_fast_div_255_epu16(res1), zeros);
            _mm_storeu_si32(now_dst, res2);
        }
        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the linked list starting at img (linked through
// ->next) into frame with blend_single_u16_simd_method5().
// A null img is a no-op.
inline void blend_u16_simd5(image_t *frame, ASS_Image *img) {
    for (; img; img = img->next) {
        blend_single_u16_simd_method5(frame, img);
    }
}
// Blends one libass image into the frame, two pixels per SSE iteration.
// This variant builds the weight vector from two broadcasts and derives the
// inverse weights with a vector subtract.
//
// frame: destination, addressed as 4 bytes per pixel (R, G, B, A order) with
//        frame->stride bytes per row.
// img:   coverage bitmap (one byte per pixel, img->stride bytes per row), its
//        0xRRGGBBAA color, and the destination offset (dst_x, dst_y).
//
// Per channel: out = div255(k * color + (255 - k) * old) with
// k = bitmap * (255 - A) / 255; the "color" used for the alpha channel is 255.
// No clipping is performed here.
inline void blend_single_u16_simd_method4(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned short r = TO_R(img->color);
    const unsigned short g = TO_G(img->color);
    const unsigned short b = TO_B(img->color);
    const __m128i zeros = _mm_setzero_si128();
    const __m128i sub_max = _mm_set1_epi16(255);
    // Loop-invariant: (R, G, B, 255) for both pixel slots.
    const __m128i rgb_v = _mm_set_epi16(255, b, g, r, 255, b, g, r);

    unsigned char *src = img->bitmap;
    unsigned char *dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
    for (int y = 0; y < img->h; ++y) {
        unsigned char *now_dst = dst;
        for (int x = 0; x < img->w;) {
            // Take the pair path only while two pixels remain. An
            // unconditional pair step would read src[w] and blend one pixel
            // past the right edge whenever the width is odd.
            if (img->w - x > 1) [[likely]] {
                const uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                const uint16_t k1 = ((unsigned int)src[x + 1]) * opacity / 255;
                // Low four lanes carry k (first pixel), high four carry k1.
                const __m128i k_v =
                    _mm_unpackhi_epi64(_mm_set1_epi16(k), _mm_set1_epi16(k1));
                const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                const __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
                const __m128i dst_v =
                    _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(now_dst)), zeros);
                const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                // Each lane is at most 255 * 255, so the 16-bit sum cannot wrap.
                const __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                const __m128i res2 =
                    _mm_packus_epi16(_mm_fast_div_255_epu16(res1), zeros);
                _mm_storeu_si64(now_dst, res2);
                x += 2;
                now_dst += 8;
            } else {
                // Last pixel of an odd-width row: 4 bytes in, 4 bytes out.
                const uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                const __m128i k_v = _mm_set1_epi16(k);
                const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                const __m128i k_v2 = _mm_sub_epi16(sub_max, k_v);
                const __m128i dst_v =
                    _mm_unpacklo_epi8(_mm_loadu_si32(now_dst), zeros);
                const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                const __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                const __m128i res2 =
                    _mm_packus_epi16(_mm_fast_div_255_epu16(res1), zeros);
                _mm_storeu_si32(now_dst, res2);
                x++;
                now_dst += 4;
            }
        }
        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the linked list starting at img (linked through
// ->next) into frame with blend_single_u16_simd_method4().
// A null img is a no-op.
inline void blend_u16_simd4(image_t *frame, ASS_Image *img) {
    for (; img; img = img->next) {
        blend_single_u16_simd_method4(frame, img);
    }
}
// Blends one libass image into the frame, two pixels per SSE iteration.
// This variant builds both the weight vector and the inverse-weight vector
// from scalar values via broadcast + unpack.
//
// frame: destination, addressed as 4 bytes per pixel (R, G, B, A order) with
//        frame->stride bytes per row.
// img:   coverage bitmap (one byte per pixel, img->stride bytes per row), its
//        0xRRGGBBAA color, and the destination offset (dst_x, dst_y).
//
// Per channel: out = div255(k * color + (255 - k) * old) with
// k = bitmap * (255 - A) / 255; the "color" used for the alpha channel is 255.
// No clipping is performed here.
inline void blend_single_u16_simd_method3(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned short r = TO_R(img->color);
    const unsigned short g = TO_G(img->color);
    const unsigned short b = TO_B(img->color);
    const __m128i zeros = _mm_setzero_si128();
    // Loop-invariant: (R, G, B, 255) for both pixel slots.
    const __m128i rgb_v = _mm_set_epi16(255, b, g, r, 255, b, g, r);

    unsigned char *src = img->bitmap;
    unsigned char *dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
    for (int y = 0; y < img->h; ++y) {
        unsigned char *now_dst = dst;
        for (int x = 0; x < img->w;) {
            // Take the pair path only while two pixels remain. An
            // unconditional pair step would read src[w] and blend one pixel
            // past the right edge whenever the width is odd.
            if (img->w - x > 1) [[likely]] {
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                uint16_t k1 = ((unsigned int)src[x + 1]) * opacity / 255;
                // Low four lanes carry k (first pixel), high four carry k1.
                const __m128i k_v =
                    _mm_unpackhi_epi64(_mm_set1_epi16(k), _mm_set1_epi16(k1));
                const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                k1 = 255 - k1;
                const __m128i k_v2 =
                    _mm_unpackhi_epi64(_mm_set1_epi16(k), _mm_set1_epi16(k1));
                const __m128i dst_v =
                    _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(now_dst)), zeros);
                const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                // Each lane is at most 255 * 255, so the 16-bit sum cannot wrap.
                const __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                const __m128i res2 =
                    _mm_packus_epi16(_mm_fast_div_255_epu16(res1), zeros);
                _mm_storeu_si64(now_dst, res2);
                x += 2;
                now_dst += 8;
            } else {
                // Last pixel of an odd-width row: 4 bytes in, 4 bytes out.
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                const __m128i k_v = _mm_set1_epi16(k);
                const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                const __m128i k_v2 = _mm_set1_epi16(k);
                const __m128i dst_v =
                    _mm_unpacklo_epi8(_mm_loadu_si32(now_dst), zeros);
                const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                const __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                const __m128i res2 =
                    _mm_packus_epi16(_mm_fast_div_255_epu16(res1), zeros);
                _mm_storeu_si32(now_dst, res2);
                x++;
                now_dst += 4;
            }
        }
        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the linked list starting at img (linked through
// ->next) into frame with blend_single_u16_simd_method3().
// A null img is a no-op.
inline void blend_u16_simd3(image_t *frame, ASS_Image *img) {
    for (; img; img = img->next) {
        blend_single_u16_simd_method3(frame, img);
    }
}
// Blends one libass image into the frame, two pixels per SSE iteration.
// This variant builds the weight vectors lane by lane with _mm_set_epi16 and
// uses 64-bit loads/stores for the destination pixels.
//
// frame: destination, addressed as 4 bytes per pixel (R, G, B, A order) with
//        frame->stride bytes per row.
// img:   coverage bitmap (one byte per pixel, img->stride bytes per row), its
//        0xRRGGBBAA color, and the destination offset (dst_x, dst_y).
//
// Per channel: out = div255(k * color + (255 - k) * old) with
// k = bitmap * (255 - A) / 255; the "color" used for the alpha channel is 255.
// No clipping is performed here.
inline void blend_single_u16_simd_method2(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned short r = TO_R(img->color);
    const unsigned short g = TO_G(img->color);
    const unsigned short b = TO_B(img->color);
    const __m128i zeros = _mm_setzero_si128();
    // Loop-invariant: (R, G, B, 255) for both pixel slots.
    const __m128i rgb_v = _mm_set_epi16(255, b, g, r, 255, b, g, r);

    unsigned char *src = img->bitmap;
    unsigned char *dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
    for (int y = 0; y < img->h; ++y) {
        unsigned char *now_dst = dst;
        for (int x = 0; x < img->w;) {
            // Take the pair path only while two pixels remain. An
            // unconditional pair step would read src[w] and blend one pixel
            // past the right edge whenever the width is odd.
            if (img->w - x > 1) [[likely]] {
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                uint16_t k1 = ((unsigned int)src[x + 1]) * opacity / 255;
                // Low four lanes carry k (first pixel), high four carry k1.
                const __m128i k_v = _mm_set_epi16(k1, k1, k1, k1, k, k, k, k);
                const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                k1 = 255 - k1;
                const __m128i k_v2 = _mm_set_epi16(k1, k1, k1, k1, k, k, k, k);
                const __m128i dst_v =
                    _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(now_dst)), zeros);
                const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                // Each lane is at most 255 * 255, so the 16-bit sum cannot wrap.
                const __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                const __m128i res2 =
                    _mm_packus_epi16(_mm_fast_div_255_epu16(res1), zeros);
                _mm_storeu_si64(now_dst, res2);
                x += 2;
                now_dst += 8;
            } else {
                // Last pixel of an odd-width row: 4 bytes in, 4 bytes out.
                uint16_t k = ((unsigned int)src[x]) * opacity / 255;
                const __m128i k_v = _mm_set1_epi16(k);
                const __m128i mul_1 = _mm_mullo_epi16(k_v, rgb_v);
                k = 255 - k;
                const __m128i k_v2 = _mm_set1_epi16(k);
                const __m128i dst_v =
                    _mm_unpacklo_epi8(_mm_loadu_si32(now_dst), zeros);
                const __m128i mul_2 = _mm_mullo_epi16(k_v2, dst_v);
                const __m128i res1 = _mm_add_epi16(mul_1, mul_2);
                const __m128i res2 =
                    _mm_packus_epi16(_mm_fast_div_255_epu16(res1), zeros);
                _mm_storeu_si32(now_dst, res2);
                x++;
                now_dst += 4;
            }
        }
        src += img->stride;
        dst += frame->stride;
    }
}
// Blends every image of the linked list starting at img (linked through
// ->next) into frame with blend_single_u16_simd_method2().
// A null img is a no-op.
inline void blend_u16_simd2(image_t *frame, ASS_Image *img) {
    for (; img; img = img->next) {
        blend_single_u16_simd_method2(frame, img);
    }
}
// Blends one libass image into the frame with 16-bit SSE arithmetic, two
// pixels at a time and a single-pixel step when only one pixel remains.
// Destination bytes are gathered and scattered one by one in this variant.
//
// frame: destination, addressed as 4 bytes per pixel (R, G, B, A order) with
//        frame->stride bytes per row.
// img:   coverage bitmap (one byte per pixel, img->stride bytes per row), its
//        0xRRGGBBAA color, and the destination offset (dst_x, dst_y).
//
// Per channel: out = div255(k * color + (255 - k) * old) with
// k = bitmap * (255 - A) / 255; the "color" used for the alpha channel is 255.
// No clipping is performed here.
inline void blend_single_u16_simd(image_t *frame, ASS_Image *img) {
    const unsigned char opacity = 255 - TO_A(img->color);
    const unsigned short r = TO_R(img->color);
    const unsigned short g = TO_G(img->color);
    const unsigned short b = TO_B(img->color);
    // (R, G, B, 255) for both pixel slots; identical for every pixel.
    const __m128i color_v = _mm_set_epi16(255, b, g, r, 255, b, g, r);

    unsigned char *mask_row = img->bitmap;
    unsigned char *out_row =
        frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;

    for (int row = 0; row < img->h; ++row) {
        int col = 0;
        while (col < img->w) {
            unsigned char *px = out_row + col * 4;
            if (img->w - col > 1) [[likely]] {
                uint16_t w0 = ((unsigned int)mask_row[col]) * opacity / 255;
                uint16_t w1 = ((unsigned int)mask_row[col + 1]) * opacity / 255;
                // Low four lanes weight the first pixel, high four the second.
                const __m128i weight = _mm_set_epi16(w1, w1, w1, w1, w0, w0, w0, w0);
                const __m128i src_term = _mm_mullo_epi16(weight, color_v);
                w0 = 255 - w0;
                w1 = 255 - w1;
                const __m128i inv_weight =
                    _mm_set_epi16(w1, w1, w1, w1, w0, w0, w0, w0);
                const __m128i old_v = _mm_set_epi16(px[7], px[6], px[5], px[4],
                                                    px[3], px[2], px[1], px[0]);
                const __m128i dst_term = _mm_mullo_epi16(inv_weight, old_v);
                __m128i packed;
                _mm_store_si128(
                    &packed,
                    _mm_fast_div_255_epu16(_mm_add_epi16(src_term, dst_term)));
                // Each result sits in the low byte of its 16-bit lane.
                for (int i = 0; i < 8; ++i) {
                    px[i] = packed.m128i_u8[2 * i];
                }
                col += 2;
            } else {
                uint16_t w0 = ((unsigned int)mask_row[col]) * opacity / 255;
                const __m128i weight = _mm_set1_epi16(w0);
                const __m128i src_term = _mm_mullo_epi16(weight, color_v);
                w0 = 255 - w0;
                const __m128i inv_weight = _mm_set1_epi16(w0);
                const __m128i old_v =
                    _mm_set_epi16(0, 0, 0, 0, px[3], px[2], px[1], px[0]);
                const __m128i dst_term = _mm_mullo_epi16(inv_weight, old_v);
                __m128i packed;
                _mm_store_si128(
                    &packed,
                    _mm_fast_div_255_epu16(_mm_add_epi16(src_term, dst_term)));
                for (int i = 0; i < 4; ++i) {
                    px[i] = packed.m128i_u8[2 * i];
                }
                col += 1;
            }
        }
        mask_row += img->stride;
        out_row += frame->stride;
    }
}
// Blends every image of the linked list starting at img (linked through
// ->next) into frame with blend_single_u16_simd().
// A null img is a no-op.
inline void blend_u16_simd(image_t *frame, ASS_Image *img) {
    for (; img; img = img->next) {
        blend_single_u16_simd(frame, img);
    }
}
#endif
inline void blend_single_normal_no_div(image_t *frame, ASS_Image *img) {
int x, y;
unsigned char opacity = 255 - TO_A(img->color);
unsigned char r = TO_R(img->color);
unsigned char g = TO_G(img->color);
unsigned char b = TO_B(img->color);
unsigned char *src;
unsigned char *dst;
src = img->bitmap;
dst = frame->buffer + img->dst_y * frame->stride + img->dst_x * 4;
for (y = 0; y < img->h; ++y) {
for (x = 0; x < img->w; ++x) {
uint32_t k = ((uint32_t)src[x]) * opacity;
// possible endianness problems...
// would anyone actually use big endian machine??