4759 lines
191 KiB
C
Executable file
4759 lines
191 KiB
C
Executable file
/*
|
|
|
|
# tiny_webp.h
|
|
|
|
Reasonably tiny, single-header WebP library in C99.
|
|
|
|
## Examples
|
|
|
|
The API is explained in more detail below, but here are some quick examples:
|
|
|
|
Load RGB/RGBA:
|
|
|
|
// If you already have the file in memory, use twp_read_from_memory() instead.
|
|
int width, height;
|
|
unsigned char *data = twp_read("path/to/file.webp",
|
|
&width, &height,
|
|
twp_FORMAT_RGBA, // or twp_FORMAT_RGB
|
|
0); // no flags
|
|
if (data) { } // Do something with the data
|
|
free(data);
|
|
|
|
Get information about a .webp file:
|
|
|
|
// If you already have the file in memory, use twp_get_info_from_memory() instead.
|
|
int width, height, lossless, alpha;
|
|
int ok = twp_get_info("path/to/file.webp", &width, &height, &lossless, &alpha);
|
|
if (ok) { } // Do something with the info
|
|
|
|
Load YUV/YUVA:
|
|
|
|
// This will return NULL if the image is lossless, use twp_get_info() to check first.
|
|
// Note that lossy WebP is 4:2:0, so the chroma (U, V) planes have half the width and height of the luma plane.
|
|
int width, height;
|
|
unsigned char *data = twp_read("path/to/file.webp",
|
|
&width, &height,
|
|
twp_FORMAT_YUV, // or twp_FORMAT_YUVA
|
|
0); // no flags
|
|
if (data) {
|
|
unsigned char *y, *u, *v;
|
|
int luma_stride, chroma_stride;
|
|
twp_unpack_yuv(data, width, height,
|
|
&y, &u, &v, NULL, // alpha plane is NULL since we want YUV
|
|
&luma_stride, &chroma_stride, NULL); // alpha stride is NULL
|
|
// Do something with the YUV data
|
|
free(data); // y, u and v are now invalid. Do not free() them!
|
|
}
|
|
|
|
## Compilation
|
|
|
|
Copy tiny_webp.h into your project, then #define twp_IMPLEMENTATION in exactly one C/C++ file
|
|
that #includes tiny_webp.h. For example:
|
|
|
|
#include <stdio.h>
|
|
#include <stdlib.h>
|
|
#define twp_IMPLEMENTATION
|
|
#include "tiny_webp.h"
|
|
|
|
You can also do the #define twp_IMPLEMENTATION part at the end of a file. That way you don't see
|
|
all the private symbols in your autocomplete:
|
|
|
|
#include "tiny_webp.h"
|
|
|
|
int main()
|
|
{
|
|
...
|
|
}
|
|
|
|
#define twp_IMPLEMENTATION
|
|
#include "tiny_webp.h"
|
|
|
|
Alternatively, you could also just create a .c file that is empty except for the #define and the
|
|
#include.
|
|
|
|
## API
|
|
|
|
unsigned char *twp_read(const char *file_path, int *width, int *height,
|
|
twp_format format, twp_flags flags)
|
|
|
|
Loads a .webp image from a file. You must free the result.
|
|
|
|
Returns NULL on error.
|
|
|
|
Formats:
|
|
|
|
* twp_FORMAT_RGBA
|
|
* twp_FORMAT_RGB
|
|
* twp_FORMAT_YUV
|
|
* twp_FORMAT_YUVA
|
|
|
|
YUV and YUVA will return NULL for lossless images. Use twp_get_info() to first check if you have
|
|
a lossless or a lossy image.
|
|
|
|
For YUV and YUVA, you must call twp_unpack_yuv() on the result.
|
|
|
|
Flags:
|
|
|
|
* twp_FLAG_SKIP_LOOP_FILTER: Skips loop filtering, which saves some decoding time, but slightly
|
|
lowers quality.
|
|
|
|
--------
|
|
|
|
unsigned char *twp_read_from_memory(void *data, int data_len, int *width, int *height,
|
|
twp_format format, twp_flags flags)
|
|
|
|
The same as twp_read(), except it reads from memory instead.
|
|
|
|
--------
|
|
|
|
void twp_unpack_yuv(unsigned char *ptr, int width, int height,
|
|
unsigned char **y, unsigned char **u, unsigned char **v, unsigned char **a,
|
|
int *luma_stride, int *chroma_stride, int *alpha_stride)
|
|
|
|
If you loaded an image and requested either YUV or YUVA as the format, you must call this
|
|
function to unpack the returned pointer into the individual planes and get the strides.
|
|
|
|
Do not free the returned Y, U, V, A pointers. Just free the original pointer returned from
|
|
twp_read(). Y, U, V and A will be valid for as long as the original pointer is valid.
|
|
|
|
You can pass NULL if you don't care about a certain plane. For example, if you requested
|
|
format YUV, there is no alpha, so pass NULL for the a and alpha_stride parameters.
|
|
|
|
--------
|
|
|
|
int twp_get_info(const char *file_path, int *width, int *height, int *lossless, int *alpha)
|
|
|
|
Get information about a .webp file.
|
|
|
|
You can pass NULL if you don't care about something.
|
|
|
|
Returns 0 on error and 1 on success.
|
|
|
|
--------
|
|
|
|
int twp_get_info_from_memory(void *data, int data_len,
|
|
int *width, int *height, int *lossless, int *alpha)
|
|
|
|
The same as twp_get_info(), except it reads from memory instead.
|
|
|
|
## Compile Time Options
|
|
|
|
Simply #define any of these. twp_STATIC needs to be seen by the header and the implementation.
|
|
For all the others, only the implementation needs to see the #define. I recommend just putting
|
|
them at the top of tiny_webp.h.
|
|
|
|
* twp_STATIC: Declare all public functions as static instead of extern.
|
|
* twp_NO_SIMD: Disable all SIMD optimizations.
|
|
* twp_FORCE_SSE2: If for some reason SSE2 support wasn't correctly detected, you can force-
|
|
enable it.
|
|
* twp_SIGNED_RIGHT_SHIFT_FIX: This library heavily relies on right-shifting negative integers to
|
|
compile to arithmetic shifts. Enable this option if your compiler does not guarantee this (which
|
|
is unlikely, all major compilers do).
|
|
|
|
## Current Limitations
|
|
|
|
* Probably not as fast as libwebp
|
|
* No encoding
|
|
* No animations
|
|
* SIMD optimizations are SSE2 only
|
|
* License is not 0BSD, which would be my preference (but that's not my fault, see the next
|
|
section)
|
|
|
|
## License
|
|
|
|
As far as I understand, it's actually impossible to release an implementation of the WebP spec
|
|
under anything more permissive than BSD3. The reason for this is that the spec uses C code
|
|
everywhere, meaning if you read the code and then write an implementation, you have created a
|
|
derivative work. If the spec was written purely in English, this would not be a problem, because
|
|
implementing something described in English does not count as a derivative work of that
|
|
description. The spec further says that "the bitstream is defined by the reference source code
|
|
and not this narrative." So, really, the source code is the spec, and therefore any
|
|
implementation of the spec is a derivative work.
|
|
|
|
It seems to me, then, that a bunch of WebP/WebM/VP8 implementations floating around the web are
|
|
quite openly violating the license.
|
|
|
|
If Google could re-license the spec to 0BSD or something, that would be great.
|
|
|
|
|
|
Copyright (c) 2010, 2011, Google Inc. All rights reserved.
|
|
Copyright (c) 2025, justus2510
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
|
|
* Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
|
|
* Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in
|
|
the documentation and/or other materials provided with the
|
|
distribution.
|
|
|
|
* Neither the name of Google nor the names of its contributors may
|
|
be used to endorse or promote products derived from this software
|
|
without specific prior written permission.
|
|
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
|
|
|
|
|
#ifndef twp__HEADER_GUARD
#define twp__HEADER_GUARD

// Linkage of the public functions: static when twp_STATIC is defined, otherwise
// extern (with C linkage when the header is compiled as C++).
#if defined(twp_STATIC)
#define twp__STORAGE static
#elif defined(__cplusplus)
#define twp__STORAGE extern "C"
#else
#define twp__STORAGE extern
#endif

// Bitwise OR of twp_FLAG_* values.
typedef int twp_flags;
#define twp_FLAG_SKIP_LOOP_FILTER (1 << 0)

// Pixel layout requested from twp_read() / twp_read_from_memory().
typedef enum {
    twp_FORMAT_RGBA,
    twp_FORMAT_RGB,
    twp_FORMAT_YUV,
    twp_FORMAT_YUVA
} twp_format;

// Loads a .webp image from a file. Returns NULL on error; the caller frees the result.
twp__STORAGE unsigned char *twp_read(const char *file_path, int *width, int *height,
                                     twp_format format, twp_flags flags);

// Same as twp_read(), but decodes from a buffer that is already in memory.
twp__STORAGE unsigned char *twp_read_from_memory(void *data, int data_len, int *width,
                                                 int *height, twp_format format, twp_flags flags);

// Splits a YUV/YUVA result of twp_read() into its planes and strides.
// Any output pointer may be NULL. The planes stay owned by `ptr`.
twp__STORAGE void twp_unpack_yuv(unsigned char *ptr, int width, int height,
                                 unsigned char **y, unsigned char **u, unsigned char **v, unsigned char **a,
                                 int *luma_stride, int *chroma_stride, int *alpha_stride);

// Queries a .webp file without decoding it. Any output pointer may be NULL.
// Returns 1 on success and 0 on error.
twp__STORAGE int twp_get_info(const char *file_path, int *width, int *height,
                              int *lossless, int *alpha);

// Same as twp_get_info(), but reads from a buffer that is already in memory.
twp__STORAGE int twp_get_info_from_memory(void *data, int data_len, int *width,
                                          int *height, int *lossless, int *alpha);

#endif // twp__HEADER_GUARD
|
|
|
|
|
|
|
|
|
|
#if defined(twp_IMPLEMENTATION) && !defined(twp__IMPL_GUARD)
|
|
#define twp__IMPL_GUARD
|
|
|
|
// Compiler-specific helpers:
//   twp__INLINE - requests forced inlining where the compiler has a spelling for it
//   twp__trap() - stops execution at the call site (used by twp__assert)
#if defined(__GNUC__) || defined(__clang__)
#define twp__INLINE __attribute__((always_inline)) inline
#define twp__trap() __builtin_trap()
#elif defined(_MSC_VER)
#define twp__INLINE __forceinline
#define twp__trap() __debugbreak()
#else
#define twp__INLINE inline
// no intrinsic available: write through a null pointer instead
#define twp__trap() do { *(int *)0 = 0; } while (0)
#endif
|
|
|
|
// Internal assertion, compiled in only when twp__ENABLE_ASSERTS is defined.
// On failure it prints the line number and the failed expression, then calls
// twp__trap(). The message goes to stderr rather than stdout: stdout may be
// fully buffered (e.g. when redirected), in which case the text would be lost
// when the trap terminates the process.
// When asserts are disabled the argument is not evaluated at all.
#ifdef twp__ENABLE_ASSERTS
#define twp__assert(x) \
    do { \
        if (!(x)) { \
            fprintf(stderr, "Assert failed (line %i): %s\n", __LINE__, #x); \
            twp__trap(); \
        } \
    } while (0)
#else
#define twp__assert(x) do { } while (0)
#endif
|
|
|
|
// SSE2 code paths are enabled unless twp_NO_SIMD is defined, and only when the
// target advertises SSE2 (GCC/Clang: __SSE2__, MSVC: x64 or /arch:SSE2 on x86)
// or the user forces it with twp_FORCE_SSE2.
#if !defined(twp_NO_SIMD) && \
    (defined(twp_FORCE_SSE2) || defined(__SSE2__) || defined(_M_X64) || \
     (defined(_M_IX86) && defined(_M_IX86_FP) && (_M_IX86_FP >= 2)))
#define twp__SSE2
#endif
|
|
|
|
#include <stdint.h>
|
|
#include <stddef.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <stdio.h>
|
|
#ifdef twp__SSE2
|
|
#include <emmintrin.h>
|
|
#endif
|
|
|
|
// Select the bit reader's accumulator width from the pointer size.
// Anything that is not clearly 64-bit is treated as 32-bit.
#if UINTPTR_MAX == 0xFFFFFFFFFFFFFFFF
#define twp__64BIT
#else
#define twp__32BIT
#endif
|
|
|
|
// Number of elements of a true array (not a pointer), as an int.
#define twp__arrlen(x) ((int)(sizeof(x) / sizeof((x)[0])))

#define twp__MIN_FILE_SIZE 24

// Upper bound on VP8L prefix-code symbols: 256 + 24 + the largest color cache (1 << 11).
#define twp__MAX_VP8L_HUFFMAN_SYMBOL (256 + 24 + (1 << 11))
#define twp__MAX_VP8L_HUFFMAN_CODE_LENGTH 15

// Codes longer than twp__HUFFMAN_SPLIT bits are resolved through a second-level
// table. From some rough testing, 8 seems good. It must stay below 15 because a
// length nibble of 0xF marks "go to subtable" in the huffman entries.
#define twp__HUFFMAN_SPLIT 8
#define twp__HUFFMAN_SPLIT_MASK ((1 << twp__HUFFMAN_SPLIT) - 1)

// Value of an unoccupied huffman entry (symbol bits all ones, length nibble zero).
#define twp__INVALID_HUFFMAN_SYMBOL_MASK 0xFFF0
|
|
|
|
// A pointer/length pair describing one chunk of the input.
typedef struct {
    void *data;
    int size;
} twp__chunk;

// One slot per chunk kind, named after the chunk's FourCC.
// NOTE(review): presumably filled while walking the RIFF container - confirm against the parser.
typedef struct {
    twp__chunk VP8X;
    twp__chunk VP8L;
    twp__chunk VP8;
    twp__chunk ALPH;
} twp__chunk_table;
|
|
|
|
// LSB-first bit reader over a byte buffer.
typedef struct {
    unsigned char *data;   // input bytes
    int num_bytes;         // number of bytes in data
    int at_byte;           // index of the next byte to load into bits

    // Bit accumulator. New bytes are OR'd in above the num_bits bits already
    // held, and bits are consumed from the least significant end.
#ifdef twp__64BIT
    uint64_t bits;
#else
    uint32_t bits;
#endif

    // Number of unread bits currently held in bits. twp__huffman_read() may
    // drive this below zero on truncated input and reports that as an error.
    int num_bits;
} twp__bit_reader;
|
|
|
|
// Two-level lookup table for a prefix code.
//
// Each entry is 16 bits:
//
//   _ _ _ _ _ _ _ _ _ _ _ _|_ _ _ _
//   msb                    |    lsb
//   symbol (12 bits)         code length (4 bits)
//
// * The code length is at most 15 (usually lower because of splitting), so it
//   fits in 4 bits. The largest symbol needs 12 bits.
// * A code length of 0xF means the symbol field is an index into subtables.
// * A symbol of 0xFFF means the entry is unoccupied and reading it is an error.
//   0xFFF is far above twp__MAX_VP8L_HUFFMAN_SYMBOL, so it cannot clash.
typedef struct twp__huffman_table {
    int max_code_length;                  // entries has (1 << max_code_length) elements
    uint16_t *entries;
    struct twp__huffman_table *subtables; // second-level tables, grown on demand
    int num_subtables;                    // used elements of subtables
    int max_subtables;                    // allocated elements of subtables
} twp__huffman_table;
|
|
|
|
// One pixel with 8 bits per channel, stored in r, g, b, a order.
typedef struct {
    uint8_t r, g, b, a;
} twp__rgba8;
|
|
|
|
// Transform kinds. The numeric values are significant, so they are spelled out.
// twp__TRANSFORM_COUNT is the number of kinds, not a kind itself.
enum {
    twp__TRANSFORM_PREDICTOR      = 0,
    twp__TRANSFORM_COLOR          = 1,
    twp__TRANSFORM_SUBTRACT_GREEN = 2,
    twp__TRANSFORM_COLOR_INDEXING = 3,
    twp__TRANSFORM_COUNT          = 4
};
|
|
|
|
typedef struct {
|
|
int type;
|
|
|
|
struct {
|
|
int pow2;
|
|
twp__rgba8 *img;
|
|
int width;
|
|
int height;
|
|
} pred_col; // both the predictor and color transform use this
|
|
|
|
struct {
|
|
twp__rgba8 *color_table;
|
|
int color_table_size;
|
|
int orig_width;
|
|
int width_divider;
|
|
int divided_width;
|
|
} color_idxing;
|
|
} twp__transform;
|
|
|
|
typedef struct {
|
|
int exists;
|
|
int pow2;
|
|
int size;
|
|
twp__rgba8 *cache;
|
|
} twp__color_cache;
|
|
|
|
typedef struct {
|
|
int exists;
|
|
int pow2;
|
|
int width;
|
|
int height;
|
|
twp__rgba8 *img;
|
|
} twp__meta_prefix_img;
|
|
|
|
typedef struct {
|
|
twp__huffman_table arr[5];
|
|
} twp__prefix_code_group;
|
|
|
|
// Lookup table of 120 integer pairs.
// NOTE(review): presumably the (dx, dy) neighborhood offsets for the short VP8L
// distance codes - confirm against the code that indexes it.
static const int twp__dist_mapping[120][2] = {
    {0, 1},  {1, 0},  {1, 1},  {-1, 1}, {0, 2},  {2, 0},  {1, 2},
    {-1, 2}, {2, 1},  {-2, 1}, {2, 2},  {-2, 2}, {0, 3},  {3, 0},
    {1, 3},  {-1, 3}, {3, 1},  {-3, 1}, {2, 3},  {-2, 3}, {3, 2},
    {-3, 2}, {0, 4},  {4, 0},  {1, 4},  {-1, 4}, {4, 1},  {-4, 1},
    {3, 3},  {-3, 3}, {2, 4},  {-2, 4}, {4, 2},  {-4, 2}, {0, 5},
    {3, 4},  {-3, 4}, {4, 3},  {-4, 3}, {5, 0},  {1, 5},  {-1, 5},
    {5, 1},  {-5, 1}, {2, 5},  {-2, 5}, {5, 2},  {-5, 2}, {4, 4},
    {-4, 4}, {3, 5},  {-3, 5}, {5, 3},  {-5, 3}, {0, 6},  {6, 0},
    {1, 6},  {-1, 6}, {6, 1},  {-6, 1}, {2, 6},  {-2, 6}, {6, 2},
    {-6, 2}, {4, 5},  {-4, 5}, {5, 4},  {-5, 4}, {3, 6},  {-3, 6},
    {6, 3},  {-6, 3}, {0, 7},  {7, 0},  {1, 7},  {-1, 7}, {5, 5},
    {-5, 5}, {7, 1},  {-7, 1}, {4, 6},  {-4, 6}, {6, 4},  {-6, 4},
    {2, 7},  {-2, 7}, {7, 2},  {-7, 2}, {3, 7},  {-3, 7}, {7, 3},
    {-7, 3}, {5, 6},  {-5, 6}, {6, 5},  {-6, 5}, {8, 0},  {4, 7},
    {-4, 7}, {7, 4},  {-7, 4}, {8, 1},  {8, 2},  {6, 6},  {-6, 6},
    {8, 3},  {5, 7},  {-5, 7}, {7, 5},  {-7, 5}, {8, 4},  {6, 7},
    {-6, 7}, {7, 6},  {-7, 6}, {8, 5},  {7, 7},  {-7, 7}, {8, 6},
    {8, 7},
};
|
|
|
|
// Arithmetic (sign-propagating) right shift of x by n bits.
// With twp_SIGNED_RIGHT_SHIFT_FIX the result does not rely on how the compiler
// implements >> for negative values.
twp__INLINE static int twp__sra(int x, int n)
{
#ifdef twp_SIGNED_RIGHT_SHIFT_FIX
    // ~x is non-negative whenever x is negative, so shifting it is well defined;
    // complementing the result restores the sign bits.
    return (x < 0) ? ~(~x >> n) : (x >> n);
#else
    return x >> n;
#endif
}
|
|
|
|
// Returns 1 if the first four bytes at fourcc equal the first four characters
// of want, and 0 otherwise. Both buffers must hold at least four bytes.
static int twp__check_fourcc(unsigned char *fourcc, const char *want)
{
    int matched = 0;
    while (matched < 4 && fourcc[matched] == want[matched])
        ++matched;
    return matched == 4;
}
|
|
|
|
// Integer division of num by den, rounded up. Intended for non-negative num
// and positive den.
twp__INLINE static int twp__div_round_up(int num, int den)
{
    int biased = num + den - 1;
    return biased / den;
}
|
|
|
|
// Absolute value of n.
twp__INLINE static int twp__abs(int n)
{
    if (n < 0)
        return -n;
    return n;
}
|
|
|
|
// Limits n to the range [low, high]. The lower bound is tested first.
twp__INLINE static int twp__clamp(int n, int low, int high)
{
    if (n < low)
        return low;
    return (n > high) ? high : n;
}
|
|
|
|
// Builds a pixel from its four channel values.
twp__INLINE static twp__rgba8 twp__make_rgba8(uint8_t r, uint8_t g, uint8_t b, uint8_t a)
{
    twp__rgba8 pix = { r, g, b, a };
    return pix;
}
|
|
|
|
// Loads whole bytes from the input into the accumulator until either no further
// byte fits or the input is exhausted. Each new byte is placed above the bits
// already held.
twp__INLINE static void twp__refill_bits(twp__bit_reader *reader)
{
    // largest bit count at which one more byte still fits into the accumulator
#ifdef twp__64BIT
    const int refill_limit = 56;
#else
    const int refill_limit = 24;
#endif

    for (;;) {
        if (reader->num_bits > refill_limit)
            break;
        if (reader->at_byte >= reader->num_bytes)
            break;

#ifdef twp__64BIT
        uint64_t next = reader->data[reader->at_byte];
#else
        uint32_t next = reader->data[reader->at_byte];
#endif
        reader->at_byte += 1;
        reader->bits |= next << reader->num_bits;
        reader->num_bits += 8;
    }
}
|
|
|
|
// Returns the lowest n bits of the accumulator without consuming them.
// "Unsafe" because it does not check that n bits are actually available; the
// caller must refill / validate. n must be in 0..31.
twp__INLINE static int twp__peek_bits_unsafe(twp__bit_reader *reader, int n)
{
    // Build the mask in unsigned arithmetic: (1 << 31) - 1 on a signed int is
    // undefined behavior, and twp__read_bits() allows n == 31.
    uint32_t mask = ((uint32_t)1 << n) - 1u;
    return (int)(reader->bits & mask);
}
|
|
|
|
// Drops the lowest n bits from the accumulator. No availability check is made,
// so num_bits can become negative; callers that may over-consume test for that.
twp__INLINE static void twp__consume_bits(twp__bit_reader *reader, int n)
{
    reader->num_bits -= n;
    reader->bits >>= n;
}
|
|
|
|
// Reads and consumes n bits (1 <= n <= 31), least significant bit first.
// Returns the value, or -1 if fewer than n bits are left in the input.
twp__INLINE static int twp__read_bits(twp__bit_reader *reader, int n)
{
    twp__assert(n >= 1 && n <= 31);

    // top up only when needed, then re-check: the input may simply be exhausted
    if (reader->num_bits < n)
        twp__refill_bits(reader);
    if (reader->num_bits < n)
        return -1;

    int value = twp__peek_bits_unsafe(reader, n);
    twp__consume_bits(reader, n);
    return value;
}
|
|
|
|
// Allocates count huffman entries, all marked as unoccupied
// (twp__INVALID_HUFFMAN_SYMBOL_MASK). The caller owns the result and releases
// it with free(). Returns NULL if the allocation fails.
static uint16_t *twp__allocate_huffman_entries(int count)
{
    uint16_t *entries = (uint16_t *)malloc(count * sizeof(*entries));
    if (!entries)
        return NULL; // out of memory: report it instead of writing through NULL

    for (int i = 0; i < count; ++i)
        entries[i] = twp__INVALID_HUFFMAN_SYMBOL_MASK;
    return entries;
}
|
|
|
|
// Builds a table for a code with one symbol (b == -1) or two symbols (a and b).
//
// This is needed for simple huffman codes because there the symbols are not
// implicit in the order of the code length array, as twp__construct_huffman_table()
// assumes. It is also used for the special case of twp__construct_huffman_table()
// being called with a single used symbol - not strictly needed, but easier.
//
// One symbol:  zero-length code, entries[0] holds the symbol.
// Two symbols: 1-bit code, the smaller symbol gets bit 0 and the larger bit 1.
//
// The function cannot return an error. If the allocation fails, table->entries
// is left NULL, and callers must check it before using the table.
static void twp__construct_simple_huffman_table(int a, int b, twp__huffman_table *table)
{
    table->subtables = NULL;
    table->max_subtables = 0;
    table->num_subtables = 0;

    twp__assert(a >= 0);

    // order the pair so that a is the smaller symbol
    if (b != -1 && a > b) {
        int tmp = a;
        a = b;
        b = tmp;
    }

    if (b == -1) {
        table->max_code_length = 0;
        table->entries = twp__allocate_huffman_entries(1);
        if (!table->entries)
            return;
        table->entries[0] = (uint16_t)(a << 4);
    } else {
        table->max_code_length = 1;
        table->entries = twp__allocate_huffman_entries(2);
        if (!table->entries)
            return;
        table->entries[0] = (uint16_t)(1 | (a << 4));
        table->entries[1] = (uint16_t)(1 | (b << 4));
    }
}
|
|
|
|
// Releases everything owned by table (its entries and, recursively, its
// subtables) and zeroes the struct. The struct itself is not freed.
static void twp__free_huffman_table(twp__huffman_table *table)
{
    int remaining = table->num_subtables;
    while (remaining-- > 0)
        twp__free_huffman_table(&table->subtables[remaining]);

    free(table->entries);
    free(table->subtables);
    memset(table, 0, sizeof(*table));
}
|
|
|
|
// Inserts one (code, symbol) pair into table.
//
//   code                      the bit-reversed code, i.e. in the order the bits are read
//   code_length               number of bits in code
//   symbol                    decoded value stored in the entry
//   max_subtable_code_length  size (log2) to give a subtable if one has to be created
//
// Codes that fit into the table are replicated over every index whose low
// code_length bits equal the code. Longer codes go through a subtable selected
// by the low twp__HUFFMAN_SPLIT bits; the remaining bits are inserted there.
//
// Returns 1 on success. Returns 0 if the code collides with an existing entry,
// does not fit the table, or memory runs out. On failure the table stays
// consistent, so the caller can release it with twp__free_huffman_table().
static int twp__insert_huffman_table_entry(twp__huffman_table *table, int code, int code_length,
                                           int symbol, int max_subtable_code_length)
{
    if (code_length <= table->max_code_length) {
        int table_size = 1 << table->max_code_length;
        int num_copies = 1 << (table->max_code_length - code_length);

        for (int j = 0; j < num_copies; ++j) {
            int idx = code | (j << code_length);
            if (idx >= table_size) return 0;
            if (table->entries[idx] != twp__INVALID_HUFFMAN_SYMBOL_MASK) return 0; // already taken

            table->entries[idx] = (uint16_t)code_length | ((uint16_t)symbol << 4);
        }

        return 1;
    } else {
        int code_first_part = code & twp__HUFFMAN_SPLIT_MASK;
        if (code_first_part >= (1 << table->max_code_length))
            return 0;

        twp__huffman_table *sub;
        if (((table->entries[code_first_part] & 0xF)) == 0xF) { // subtable already exists
            uint16_t entry = table->entries[code_first_part];
            uint16_t idx = entry >> 4;
            twp__assert(idx < (1 << twp__HUFFMAN_SPLIT));
            twp__assert(idx < table->num_subtables);
            sub = table->subtables + idx;
        } else { // no subtable yet, insert one
            if (table->entries[code_first_part] != twp__INVALID_HUFFMAN_SYMBOL_MASK)
                return 0; // a shorter code already occupies this prefix

            twp__assert(table->num_subtables < (1 << twp__HUFFMAN_SPLIT));
            if (table->num_subtables >= table->max_subtables) {
                // grow through a temporary so the old array is not lost if realloc fails
                int new_max = table->max_subtables ? table->max_subtables*2 : 4;
                twp__huffman_table *grown = (twp__huffman_table *)realloc(table->subtables, new_max * sizeof(twp__huffman_table));
                if (!grown)
                    return 0;
                table->subtables = grown;
                table->max_subtables = new_max;
            }

            // allocate before the subtable is counted, so a failure leaves no
            // half-initialized element behind
            uint16_t *sub_entries = twp__allocate_huffman_entries(1 << max_subtable_code_length);
            if (!sub_entries)
                return 0;

            int subtable_idx = table->num_subtables++;
            sub = table->subtables + subtable_idx;
            memset(sub, 0, sizeof(*sub));
            sub->max_code_length = max_subtable_code_length;
            sub->entries = sub_entries;
            table->entries[code_first_part] = (uint16_t)((subtable_idx << 4) | 0xF);
        }

        return twp__insert_huffman_table_entry(sub, code >> twp__HUFFMAN_SPLIT, code_length - twp__HUFFMAN_SPLIT, symbol, 0);
    }
}
|
|
|
|
// Builds a canonical prefix-code lookup table from per-symbol code lengths.
//
//   code_lengths      code_lengths[s] is the length of symbol s, 0 = symbol unused
//   num_code_lengths  number of elements in code_lengths (the alphabet size)
//   table             receives the table; only valid when 1 is returned
//
// Returns 1 on success. Returns 0 if the lengths are out of range, the code is
// inconsistent (two codes claim the same slot), or memory runs out. On failure
// the table owns nothing and must not be freed by the caller.
//
// An incomplete code is accepted; its unused slots stay marked invalid and are
// rejected when read.
//
// NOTE(review): the single-symbol shortcut only triggers for a lone symbol of
// length 1. The reference decoder is believed to treat any table with exactly
// one used symbol as a zero-bit code regardless of its stated length - confirm.
static int twp__construct_huffman_table(int *code_lengths, int num_code_lengths, twp__huffman_table *table)
{
    if (num_code_lengths <= 0)
        return 0;

    int max_code_length = 0;
    int nz_sym = -1;
    int code_length_histogram[twp__MAX_VP8L_HUFFMAN_CODE_LENGTH + 1] = {0};
    for (int i = 0; i < num_code_lengths; ++i) {
        int len = code_lengths[i];
        if (len == 0)
            continue;
        // validate before the length is used as an array index
        if (len < 0 || len > twp__MAX_VP8L_HUFFMAN_CODE_LENGTH)
            return 0;
        nz_sym = i;
        ++code_length_histogram[len];
        if (len > max_code_length)
            max_code_length = len;
    }

    if ((max_code_length == 1) && (code_length_histogram[1] == 1)) {
        // special case when we have only one entry in the huffman table
        twp__assert(nz_sym != -1);
        twp__construct_simple_huffman_table(nz_sym, -1, table);
        return table->entries != NULL;
    }

    int max_main_table_code_length;
    int max_subtable_code_length;
    if (max_code_length > twp__HUFFMAN_SPLIT) {
        max_main_table_code_length = twp__HUFFMAN_SPLIT;
        max_subtable_code_length = max_code_length - twp__HUFFMAN_SPLIT;
    } else {
        max_main_table_code_length = max_code_length;
        max_subtable_code_length = 0;
    }

    table->max_code_length = max_main_table_code_length;
    table->num_subtables = 0;
    table->max_subtables = 0;
    table->subtables = NULL;
    table->entries = twp__allocate_huffman_entries(1 << max_main_table_code_length);
    if (!table->entries)
        return 0;

    // first canonical code of every length
    int next_code[twp__MAX_VP8L_HUFFMAN_CODE_LENGTH + 1] = {0};
    int current_code = 0;
    for (int code_length = 1; code_length <= twp__MAX_VP8L_HUFFMAN_CODE_LENGTH; ++code_length) {
        current_code = (current_code + code_length_histogram[code_length - 1]) << 1;
        next_code[code_length] = current_code;
    }

    for (int symbol = 0; symbol < num_code_lengths; ++symbol) {
        int code_length = code_lengths[symbol];
        if (code_length == 0) continue;
        twp__assert(code_length <= twp__MAX_VP8L_HUFFMAN_CODE_LENGTH);
        twp__assert(symbol <= twp__MAX_VP8L_HUFFMAN_SYMBOL); // 12 bits

        // the bit reader delivers bits LSB first, so the table is indexed by the reversed code
        int code = next_code[code_length];
        int reversed_code = 0;
        for (int j = 0; j < code_length; ++j) {
            int bit = (code >> j) & 1;
            reversed_code |= bit << (code_length - j - 1);
        }

        if (!twp__insert_huffman_table_entry(table, reversed_code, code_length, symbol, max_subtable_code_length)) {
            twp__free_huffman_table(table);
            return 0;
        }

        ++next_code[code_length];
    }

    // we don't actually know what the max code length for a subtable is until we built it
    for (int i = 0; i < table->num_subtables; ++i) {
        twp__huffman_table *sub = table->subtables + i;
        int new_max_code_length = 0;
        for (int j = 0; j < 1 << sub->max_code_length; ++j) {
            if ((sub->entries[j] & 0xF) > new_max_code_length)
                new_max_code_length = sub->entries[j] & 0xF;
        }
        sub->max_code_length = new_max_code_length;
    }

    return 1;
}
|
|
|
|
// Decodes one symbol from the bit stream using ht.
// Returns the symbol (>= 0), or -1 if the bits select an unoccupied entry or
// the input ends before the code is complete.
twp__INLINE static int twp__huffman_read(twp__bit_reader *reader, twp__huffman_table *ht)
{
    if (ht->max_code_length == 0) {
        // Zero-length code: no bits are consumed. The single entry can still be
        // unoccupied (a table built from all-zero code lengths), which must be
        // an error rather than decode as symbol 0xFFF.
        uint16_t only = ht->entries[0];
        if ((only & twp__INVALID_HUFFMAN_SYMBOL_MASK) == twp__INVALID_HUFFMAN_SYMBOL_MASK) return -1;
        return only >> 4;
    }

    // refill
    if (reader->num_bits < twp__MAX_VP8L_HUFFMAN_CODE_LENGTH) twp__refill_bits(reader);

    // step 1
    int bits = twp__peek_bits_unsafe(reader, ht->max_code_length);
    uint16_t entry = ht->entries[bits];

    // step 2, if needed
    if ((entry & 0xF) == 0xF) {
        twp__consume_bits(reader, twp__HUFFMAN_SPLIT);
        ht = &ht->subtables[entry >> 4];
        bits = twp__peek_bits_unsafe(reader, ht->max_code_length);
        entry = ht->entries[bits];
    }

    if ((entry & twp__INVALID_HUFFMAN_SYMBOL_MASK) == twp__INVALID_HUFFMAN_SYMBOL_MASK) return -1;
    twp__consume_bits(reader, entry & 0xF);
    // the peeks above do not check availability; a negative count means the
    // code ran past the end of the input
    if (reader->num_bits < 0) return -1;
    return entry >> 4;
}
|
|
|
|
// Extracts the 16-bit prefix code group index stored in a meta prefix image
// pixel: red is the high byte, green the low byte.
twp__INLINE static int twp__get_meta_prefix_code(twp__rgba8 pix)
{
    int high = (int)pix.r << 8;
    int low = (int)pix.g;
    return high | low;
}
|
|
|
|
// Maps a pixel to its color cache slot: the pixel is packed as 0xAARRGGBB,
// multiplied by a constant in 32-bit arithmetic, and the top color_cache_pow2
// bits of the product are the index. color_cache_pow2 must be at least 1.
twp__INLINE static int twp__hash_pixel(twp__rgba8 pix, int color_cache_pow2)
{
    uint32_t argb = (uint32_t)pix.b;
    argb |= (uint32_t)pix.g << 8;
    argb |= (uint32_t)pix.r << 16;
    argb |= (uint32_t)pix.a << 24;

    uint32_t product = 0x1e35a7bd * argb;
    return (int)(product >> (32 - color_cache_pow2));
}
|
|
|
|
// Converts an LZ77 prefix code (length or distance) into its value, reading
// the extra bits the code calls for from the stream.
//
// Codes 0..3 map directly to 1..4. A larger code selects a range and is
// followed by (code - 2) >> 1 extra bits giving the offset within that range.
//
// Returns the value (>= 1), or -1 if code is negative, out of range, or the
// stream ends early.
twp__INLINE static int twp__decode_lz77(twp__bit_reader *reader, int code)
{
    // The largest prefix-code alphabet decoded through this function has 40
    // symbols. Anything above would ask twp__read_bits() for more bits than it
    // supports and overflow the shift below, so reject it.
    enum { NUM_LZ77_PREFIX_CODES = 40 };

    if (code < 0 || code >= NUM_LZ77_PREFIX_CODES) {
        return -1;
    } else if (code < 4) {
        return code + 1;
    } else {
        int num_extra_bits = (code - 2) >> 1;
        int extra_bits = twp__read_bits(reader, num_extra_bits);
        if (extra_bits == -1) return -1;

        int offset = (2 + (code & 1)) << num_extra_bits;
        return offset + extra_bits + 1;
    }
}
|
|
|
|
// Reads one prefix code from the stream and builds its lookup table.
//
//   reader            bit stream positioned at the start of the prefix code
//   color_cache_size  number of color cache entries (0 if there is no cache);
//                     it enlarges the alphabet of group index 0
//   group_idx         position of this code within its group (0..4); it
//                     determines the alphabet size
//   result            receives the table; only valid when 1 is returned
//
// Returns 1 on success and 0 on malformed input, truncated input, or
// allocation failure. On failure result owns nothing.
static int twp__read_prefix_code(twp__bit_reader *reader, int color_cache_size, int group_idx, twp__huffman_table *result)
{
    int is_simple = twp__read_bits(reader, 1);
    if (is_simple == -1) return 0;

    if (is_simple) {
        // a failed read (-1) shows up as 0 after the +1
        int num_symbols = twp__read_bits(reader, 1) + 1;
        if (num_symbols == 0) return 0;

        int first_is_8bits = twp__read_bits(reader, 1);
        if (first_is_8bits == -1) return 0;

        int sym0 = twp__read_bits(reader, first_is_8bits ? 8 : 1);
        if (sym0 == -1) return 0;

        int sym1 = -1;
        if (num_symbols == 2) {
            sym1 = twp__read_bits(reader, 8);
            if (sym1 == -1) return 0;
        }

        twp__construct_simple_huffman_table(sym0, sym1, result);
        if (!result->entries) return 0; // allocation failed inside the constructor
        return 1;
    } else {
        // the huffman code lengths themselves are huffman coded

        // a failed read (-1) shows up as 3 after the +4
        int num_stored_code_length_code_lengths = twp__read_bits(reader, 4) + 4;
        if (num_stored_code_length_code_lengths == 3) return 0;

        // i think the reason this uses a weird ordering like that is because that way
        // it's more likely you can just skip the last ones?
        enum { NUM_CODE_LENGTH_CODE_LENGTHS = 19 };
        static const int code_length_code_order[NUM_CODE_LENGTH_CODE_LENGTHS] = {
            17, 18, 0, 1, 2, 3, 4, 5, 16, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
        };

        int code_length_code_lengths[NUM_CODE_LENGTH_CODE_LENGTHS] = {0};
        for (int j = 0; j < num_stored_code_length_code_lengths; ++j) {
            int bits = twp__read_bits(reader, 3);
            if (bits == -1) return 0;

            code_length_code_lengths[code_length_code_order[j]] = bits;
        }

        // if the "mode" bit is 1, that means we should NOT decode until we have the entire
        // alphabet (the alphabet size is specified by the current index of the group, see the
        // switch statement below).
        // instead, we decode exactly n times, and the rest of the code lengths will be 0.

        int num_code_length_symbols; // this is the alphabet size
        switch (group_idx) {
            case 0: {
                num_code_length_symbols = 256 + 24 + color_cache_size;
            } break;

            case 1:
            case 2:
            case 3: {
                num_code_length_symbols = 256;
            } break;

            case 4: {
                num_code_length_symbols = 40;
            } break;

            default: {
                twp__assert(0);
                return 0;
            } break;
        }

        int mode = twp__read_bits(reader, 1);
        if (mode == -1) return 0;

        int num_code_length_symbols_to_decode;
        if (mode == 0) {
            num_code_length_symbols_to_decode = -1; // -1 = no limit, decode the whole alphabet
        } else {
            int bits_to_read = twp__read_bits(reader, 3);
            if (bits_to_read == -1) return 0;
            bits_to_read = 2 + 2*bits_to_read;

            // a failed read (-1) shows up as 1 after the +2
            num_code_length_symbols_to_decode = twp__read_bits(reader, bits_to_read) + 2;
            if (num_code_length_symbols_to_decode == 1) return 0;
        }

        twp__huffman_table code_lengths_ht;
        if (!twp__construct_huffman_table(code_length_code_lengths, NUM_CODE_LENGTH_CODE_LENGTHS, &code_lengths_ht))
            return 0;

        int ok = 0;
        int count = 0;
        // Most recent non-zero code length, repeated by symbol 16. It starts at 8
        // and only literal lengths change it; repeats copy it unchanged.
        int prev_non_zero = 8;
        int *code_lengths = (int *)calloc(num_code_length_symbols, sizeof(*code_lengths));
        if (!code_lengths) goto end;

        for (;;) {
            if (count >= num_code_length_symbols) break;
            if (num_code_length_symbols_to_decode == 0) break;

            int bits = twp__huffman_read(reader, &code_lengths_ht);
            if (bits == -1) goto end;
            if (num_code_length_symbols_to_decode != -1) --num_code_length_symbols_to_decode;

            if (bits <= 15) {
                // literal code length
                code_lengths[count++] = bits;
                if (bits != 0) prev_non_zero = bits;
            } else if (bits == 16) {
                // repeat the previous non-zero length 3..6 times
                int repeat_count = twp__read_bits(reader, 2);
                if (repeat_count == -1) goto end;
                repeat_count += 3;
                if (count + repeat_count > num_code_length_symbols) goto end;

                for (int i = 0; i < repeat_count; ++i)
                    code_lengths[count++] = prev_non_zero;
            } else if (bits == 17 || bits == 18) {
                // run of zeros: 3..10 for symbol 17, 11..138 for symbol 18
                int repeat_count = twp__read_bits(reader, (bits == 17) ? 3 : 7);
                if (repeat_count == -1) goto end;
                repeat_count += (bits == 17) ? 3 : 11;
                if (count + repeat_count > num_code_length_symbols) goto end;

                for (int i = 0; i < repeat_count; ++i)
                    code_lengths[count++] = 0;
            } else {
                twp__assert(0);
                goto end;
            }
        }

        twp__assert(count <= num_code_length_symbols);
        // with a decode limit, all announced symbols must have been consumed
        if (num_code_length_symbols_to_decode != 0 && num_code_length_symbols_to_decode != -1)
            goto end;

        if (!twp__construct_huffman_table(code_lengths, num_code_length_symbols, result))
            goto end;

        ok = 1;

    end:
        free(code_lengths);
        twp__free_huffman_table(&code_lengths_ht);
        return ok;
    }
}
|
|
|
|
// Reads the five prefix codes of one group, in index order.
// Returns 1 on success. On failure returns 0 after freeing the tables that had
// already been built, so result owns nothing.
static int twp__read_prefix_code_group(twp__bit_reader *reader, int color_cache_size, twp__prefix_code_group *result)
{
    int num_read = 0;
    while (num_read < 5 && twp__read_prefix_code(reader, color_cache_size, num_read, &result->arr[num_read]))
        ++num_read;

    if (num_read == 5)
        return 1;

    // undo the tables built before the failing one
    while (num_read-- > 0)
        twp__free_huffman_table(&result->arr[num_read]);
    return 0;
}
|
|
|
|
// Decodes one entropy-coded VP8L image of width x height pixels from `reader`.
//
// `main` is non-zero for the main image, which may carry a meta prefix image that selects a prefix
// code group per pixel block; sub-images (main == 0) always use a single prefix code group.
//
// Returns a malloc()'d array of width*height pixels owned by the caller, or NULL if the bitstream
// is truncated/invalid or an allocation fails.
static twp__rgba8 *twp__read_coded_image(twp__bit_reader *reader, int width, int height, int main)
{
    twp__rgba8 *result = NULL;
    twp__color_cache color_cache;
    twp__meta_prefix_img meta_prefix;
    twp__prefix_code_group *prefix_code_groups = NULL;
    int num_prefix_code_groups = 0;
    int num_pix = width * height;

    memset(&color_cache, 0, sizeof(color_cache));
    memset(&meta_prefix, 0, sizeof(meta_prefix));

    // the pixel loop computes i % width, so a non-positive size must never get that far
    if (width <= 0 || height <= 0) goto err;

    color_cache.exists = twp__read_bits(reader, 1);
    if (color_cache.exists == -1) goto err;

    if (color_cache.exists) {
        color_cache.pow2 = twp__read_bits(reader, 4);
        if (color_cache.pow2 < 1 || color_cache.pow2 > 11) goto err; // also catches a read error (-1)

        color_cache.size = 1 << color_cache.pow2;
        color_cache.cache = (twp__rgba8 *)calloc((size_t)color_cache.size, sizeof(twp__rgba8));
        if (!color_cache.cache) goto err;
    }

    if (main) { // only the main image has meta prefix codes, sub images do not
        meta_prefix.exists = twp__read_bits(reader, 1);
        if (meta_prefix.exists == -1) goto err;

        if (meta_prefix.exists) {
            meta_prefix.pow2 = twp__read_bits(reader, 3) + 2;
            if (meta_prefix.pow2 == 1) goto err; // read error: -1 + 2

            meta_prefix.width = twp__div_round_up(width, 1 << meta_prefix.pow2);
            meta_prefix.height = twp__div_round_up(height, 1 << meta_prefix.pow2);
            meta_prefix.img = twp__read_coded_image(reader, meta_prefix.width, meta_prefix.height, 0);
            if (!meta_prefix.img) goto err;
        }
    }

    // the number of groups is the largest meta prefix code in use, plus one
    if (meta_prefix.exists) {
        for (int i = 0; i < meta_prefix.width * meta_prefix.height; ++i) {
            twp__rgba8 pix = meta_prefix.img[i];
            int mpc = twp__get_meta_prefix_code(pix);
            if (mpc > num_prefix_code_groups)
                num_prefix_code_groups = mpc;
        }
    }
    ++num_prefix_code_groups;

    // zero-initialized so that the cleanup code below only ever sees tables that were either
    // fully read or never touched, even when reading fails halfway through
    prefix_code_groups = (twp__prefix_code_group *)calloc((size_t)num_prefix_code_groups, sizeof(twp__prefix_code_group));
    if (!prefix_code_groups) {
        num_prefix_code_groups = 0; // nothing for the cleanup loop to walk
        goto err;
    }
    for (int i = 0; i < num_prefix_code_groups; ++i) {
        if (!twp__read_prefix_code_group(reader, color_cache.size, prefix_code_groups + i))
            goto err;
    }

    result = (twp__rgba8 *)malloc((size_t)num_pix * sizeof(twp__rgba8));
    if (!result) goto err;

    for (int i = 0; i < num_pix;) {
        int x = i % width;
        int y = i / width;

        twp__prefix_code_group *prefix_code_group;
        if (meta_prefix.exists) {
            int position = (y >> meta_prefix.pow2) * meta_prefix.width + (x >> meta_prefix.pow2);
            if (position >= meta_prefix.width * meta_prefix.height) goto err;

            int code = twp__get_meta_prefix_code(meta_prefix.img[position]);
            if (code >= num_prefix_code_groups) goto err;

            prefix_code_group = prefix_code_groups + code;
        } else {
            prefix_code_group = prefix_code_groups;
        }

        // S selects between a literal pixel, an LZ77 backward reference and a color cache lookup
        int S = twp__huffman_read(reader, &prefix_code_group->arr[0]);
        if (S < 0) goto err;
        if (S > 256 + 24 + color_cache.size - 1) goto err;

        if (S < 256) {
            // literal: S is the green channel, the other channels have their own prefix codes
            int r = twp__huffman_read(reader, &prefix_code_group->arr[1]);
            int g = S;
            int b = twp__huffman_read(reader, &prefix_code_group->arr[2]);
            int a = twp__huffman_read(reader, &prefix_code_group->arr[3]);
            if (r == -1 || b == -1 || a == -1) goto err;

            twp__rgba8 pix = twp__make_rgba8((uint8_t)r, (uint8_t)g, (uint8_t)b, (uint8_t)a);

            if (color_cache.exists) {
                int hash = twp__hash_pixel(pix, color_cache.pow2);
                if (hash < 0 || hash >= color_cache.size) goto err;
                color_cache.cache[hash] = pix;
            }

            result[i++] = pix;
        } else if (S < 256 + 24) {
            // backward reference: copy `length` pixels starting `dist` pixels back
            int length_code = S - 256;
            int length = twp__decode_lz77(reader, length_code);
            if (length < 1) goto err; // -1 on error; a length of 0 would never advance i

            int dist_code = twp__huffman_read(reader, &prefix_code_group->arr[4]);
            if (dist_code < 0) goto err; // must not be fed to twp__decode_lz77()
            int dist = twp__decode_lz77(reader, dist_code);
            if (dist < 1) goto err; // dist - 1 indexes twp__dist_mapping below

            if (dist > 120) {
                dist -= 120;
            } else {
                // the first 120 distance codes map to (dx, dy) offsets around the current pixel
                int dx = twp__dist_mapping[dist - 1][0];
                int dy = twp__dist_mapping[dist - 1][1];
                dist = dy*width + dx;
                if (dist < 1) dist = 1;
            }

            if (dist > i) goto err;
            if (i + length > num_pix) goto err;

            // source and destination may overlap, so this has to be a forward element-wise copy
            for (int j = 0; j < length; ++j)
                result[i + j] = result[i - dist + j];

            if (color_cache.exists) {
                for (int j = 0; j < length; ++j) {
                    twp__rgba8 pix = result[i + j];
                    int hash = twp__hash_pixel(pix, color_cache.pow2);
                    if (hash < 0 || hash >= color_cache.size) goto err;
                    color_cache.cache[hash] = pix;
                }
            }

            i += length;
        } else {
            // color cache lookup
            if (!color_cache.exists) goto err;

            int color_cache_idx = S - (256 + 24);
            if (color_cache_idx < 0 || color_cache_idx >= color_cache.size) goto err;

            result[i++] = color_cache.cache[color_cache_idx];
        }
    }

    goto end;

err:
    free(result);
    result = NULL;

end:
    free(color_cache.cache);
    free(meta_prefix.img);
    if (prefix_code_groups) {
        for (int i = 0; i < num_prefix_code_groups; ++i) {
            for (int j = 0; j < 5; ++j) {
                twp__free_huffman_table(&prefix_code_groups[i].arr[j]);
            }
        }
    }
    free(prefix_code_groups);
    return result;
}
|
|
|
|
// Releases the heap data owned by the first `count` transforms in `tfs` and zeroes each entry.
static void twp__free_transforms(twp__transform *tfs, int count)
{
    twp__transform *cur = tfs;

    for (int left = count; left > 0; --left, ++cur) {
        switch (cur->type) {
            case twp__TRANSFORM_SUBTRACT_GREEN:
                // carries no data, nothing to free
                break;

            case twp__TRANSFORM_PREDICTOR:
            case twp__TRANSFORM_COLOR:
                free(cur->pred_col.img);
                break;

            case twp__TRANSFORM_COLOR_INDEXING:
                free(cur->color_idxing.color_table);
                break;

            default:
                twp__assert(0);
                break;
        }

        memset(cur, 0, sizeof(*cur));
    }
}
|
|
|
|
// Reads at most one transform from the bitstream into `tf`.
//
// *finished is set to 1 (and `tf` is left untouched) when the stream signals that no further
// transform follows. `already_seen` is indexed by transform type; a type that occurs twice is an
// error. A color indexing transform replaces *img_width with its bit-packed width.
//
// Returns 1 on success and 0 on a read error or an invalid stream.
static int twp__read_transform(twp__bit_reader *reader, twp__transform *tf, int *img_width,
                               int img_height, int *already_seen, int *finished)
{
    *finished = 0;

    int present = twp__read_bits(reader, 1);
    if (present == -1) return 0;
    if (!present) {
        *finished = 1;
        return 1;
    }

    int kind = twp__read_bits(reader, 2);
    if (kind == -1 || already_seen[kind]) return 0;
    already_seen[kind] = 1;
    tf->type = kind;

    if (kind == twp__TRANSFORM_PREDICTOR || kind == twp__TRANSFORM_COLOR) {
        // both store one sub-image pixel per (1 << pow2) x (1 << pow2) block
        tf->pred_col.pow2 = twp__read_bits(reader, 3) + 2;
        if (tf->pred_col.pow2 == 1) return 0; // read error: -1 + 2

        int block_size = 1 << tf->pred_col.pow2;
        tf->pred_col.width = twp__div_round_up(*img_width, block_size);
        tf->pred_col.height = twp__div_round_up(img_height, block_size);
        tf->pred_col.img = twp__read_coded_image(reader, tf->pred_col.width, tf->pred_col.height, 0);
        if (!tf->pred_col.img) return 0;
    } else if (kind == twp__TRANSFORM_SUBTRACT_GREEN) {
        // no data
    } else if (kind == twp__TRANSFORM_COLOR_INDEXING) {
        tf->color_idxing.orig_width = *img_width;
        tf->color_idxing.color_table_size = twp__read_bits(reader, 8) + 1;

        int table_size = tf->color_idxing.color_table_size;
        if (table_size == 0) return 0; // read error: -1 + 1

        // small tables pack several indices into one pixel
        int divider = 1;
        if (table_size <= 2)       divider = 1 << 3;
        else if (table_size <= 4)  divider = 1 << 2;
        else if (table_size <= 16) divider = 1 << 1;
        tf->color_idxing.width_divider = divider;

        tf->color_idxing.color_table = twp__read_coded_image(reader, tf->color_idxing.color_table_size, 1, 0);
        if (!tf->color_idxing.color_table) return 0;

        // the color table is delta coded: each entry is stored relative to its predecessor
        twp__rgba8 *table = tf->color_idxing.color_table;
        for (int i = 1; i < tf->color_idxing.color_table_size; ++i) {
            table[i].r += table[i-1].r;
            table[i].g += table[i-1].g;
            table[i].b += table[i-1].b;
            table[i].a += table[i-1].a;
        }

        tf->color_idxing.divided_width = twp__div_round_up(*img_width, tf->color_idxing.width_divider);
        *img_width = tf->color_idxing.divided_width;
    } else {
        twp__assert(0);
        return 0;
    }

    return 1;
}
|
|
|
|
// Reads the list of transforms that precedes the main image into `tfs`, which must have room for
// twp__TRANSFORM_COUNT entries.
//
// Returns the number of transforms read, or -1 on error (everything read so far is freed).
static int twp__read_transforms(twp__bit_reader *reader, twp__transform *tfs, int img_width, int img_height)
{
    int count = 0;
    int already_seen[twp__TRANSFORM_COUNT] = {0};
    int finished = 0;

    // A stream may use every transform type once, and the terminating "no more transforms" bit
    // still has to be read after the last of them. Once `tfs` is full that read targets this
    // scratch slot, so a full list is accepted without ever writing past the caller's array.
    twp__transform spare;
    memset(&spare, 0, sizeof(spare));

    while (!finished) {
        twp__transform *slot = (count < twp__TRANSFORM_COUNT) ? tfs + count : &spare;

        if (!twp__read_transform(reader, slot, &img_width, img_height, already_seen, &finished)) {
            twp__free_transforms(tfs, count);
            return -1;
        }

        if (!finished) {
            if (slot == &spare) {
                // not expected: with every type already seen, twp__read_transform() rejects any
                // further transform. Kept as a guard so `count` can never exceed the array size.
                twp__free_transforms(&spare, 1);
                twp__free_transforms(tfs, count);
                return -1;
            }
            ++count;
        }
    }
    return count;
}
|
|
|
|
// Channel-wise average of two pixels, rounded down.
twp__INLINE static twp__rgba8 twp__avg2(twp__rgba8 a, twp__rgba8 b)
{
    twp__rgba8 mean;
    mean.r = (uint8_t)((a.r + b.r) >> 1);
    mean.g = (uint8_t)((a.g + b.g) >> 1);
    mean.b = (uint8_t)((a.b + b.b) >> 1);
    mean.a = (uint8_t)((a.a + b.a) >> 1);
    return mean;
}
|
|
|
|
// Color transform delta: both inputs are reinterpreted as signed 8-bit values, multiplied, and the
// product is shifted right by 5 and narrowed back to a signed 8-bit value.
twp__INLINE static int twp__color_transform_delta(int t, int c)
{
    int signed_t = (int8_t)t;
    int signed_c = (int8_t)c;
    int product = signed_t * signed_c;
    return (int8_t)(product >> 5);
}
|
|
|
|
// Computes the predicted value for pixel (x, y) of `img` when undoing a predictor transform.
// Pixels above and to the left of (x, y) must already be fully reconstructed.
// Returns 1 and stores the prediction in *pred, or 0 if the sub-image holds an invalid mode.
static int twp__vp8l_predict_pixel(const twp__rgba8 *img, int width, int x, int y,
                                   const twp__transform *tf, twp__rgba8 *pred)
{
    int i = y*width + x;

    // the first pixel, the top row and the left column use fixed predictors
    if (i == 0) {
        *pred = twp__make_rgba8(0, 0, 0, 255);
        return 1;
    }
    if (y == 0) {
        *pred = img[i - 1];
        return 1;
    }
    if (x == 0) {
        *pred = img[i - width];
        return 1;
    }

    twp__rgba8 left = img[i - 1];
    twp__rgba8 top = img[i - width];
    twp__rgba8 top_left = img[i - width - 1];
    // in the rightmost column this index lands on the first pixel of the current row
    twp__rgba8 top_right = img[i - width + 1];

    int subimg_i = (y >> tf->pred_col.pow2)*tf->pred_col.width + (x >> tf->pred_col.pow2);
    int pred_mode = tf->pred_col.img[subimg_i].g;
    if (pred_mode >= 14)
        return 0;

    switch (pred_mode) {
        case 0:  *pred = twp__make_rgba8(0, 0, 0, 255); break;
        case 1:  *pred = left; break;
        case 2:  *pred = top; break;
        case 3:  *pred = top_right; break;
        case 4:  *pred = top_left; break;
        case 5:  *pred = twp__avg2(twp__avg2(left, top_right), top); break;
        case 6:  *pred = twp__avg2(left, top_left); break;
        case 7:  *pred = twp__avg2(left, top); break;
        case 8:  *pred = twp__avg2(top_left, top); break;
        case 9:  *pred = twp__avg2(top, top_right); break;
        case 10: *pred = twp__avg2(twp__avg2(left, top_left), twp__avg2(top, top_right)); break;

        case 11: {
            // pick whichever of left/top is closer to the gradient estimate left + top - top_left
            int pr = left.r + top.r - top_left.r;
            int pg = left.g + top.g - top_left.g;
            int pb = left.b + top.b - top_left.b;
            int pa = left.a + top.a - top_left.a;

            // manhattan distances
            int dist_left = twp__abs(pr - left.r) + twp__abs(pg - left.g) + twp__abs(pb - left.b) + twp__abs(pa - left.a);
            int dist_top = twp__abs(pr - top.r) + twp__abs(pg - top.g) + twp__abs(pb - top.b) + twp__abs(pa - top.a);

            if (dist_left < dist_top)
                *pred = left;
            else
                *pred = top;
        } break;

        case 12: {
            pred->r = (uint8_t)twp__clamp(left.r + top.r - top_left.r, 0, 255);
            pred->g = (uint8_t)twp__clamp(left.g + top.g - top_left.g, 0, 255);
            pred->b = (uint8_t)twp__clamp(left.b + top.b - top_left.b, 0, 255);
            pred->a = (uint8_t)twp__clamp(left.a + top.a - top_left.a, 0, 255);
        } break;

        case 13: {
            twp__rgba8 a = twp__avg2(left, top);
            twp__rgba8 b = top_left;

            pred->r = (uint8_t)twp__clamp(a.r + ((a.r - b.r) / 2), 0, 255);
            pred->g = (uint8_t)twp__clamp(a.g + ((a.g - b.g) / 2), 0, 255);
            pred->b = (uint8_t)twp__clamp(a.b + ((a.b - b.b) / 2), 0, 255);
            pred->a = (uint8_t)twp__clamp(a.a + ((a.a - b.a) / 2), 0, 255);
        } break;

        default: {
            twp__assert(0);
            memset(pred, 0, sizeof(*pred));
        } break;
    }

    return 1;
}

// Undoes transform `tf` on `img`, which holds width x height pixels.
//
// Takes ownership of `img`. Returns the transformed image: usually `img` itself, modified in
// place, but a bit-packed color indexing transform returns a new, wider buffer and frees `img`.
// Returns NULL (with `img` freed) on invalid predictor data or allocation failure.
static twp__rgba8 *twp__reverse_transform(twp__rgba8 *img, int width, int height, twp__transform *tf)
{
    switch (tf->type) {
        case twp__TRANSFORM_PREDICTOR: {
            for (int y = 0; y < height; ++y) {
                for (int x = 0; x < width; ++x) {
                    int i = y*width + x;

                    twp__rgba8 pred;
                    if (!twp__vp8l_predict_pixel(img, width, x, y, tf, &pred)) {
                        free(img);
                        return NULL;
                    }

                    // the stored pixel is the residual; channels wrap around modulo 256
                    img[i].r += pred.r;
                    img[i].g += pred.g;
                    img[i].b += pred.b;
                    img[i].a += pred.a;
                }
            }

            return img;
        }

        case twp__TRANSFORM_COLOR: {
            for (int y = 0; y < height; ++y) {
                for (int x = 0; x < width; ++x) {
                    int i = y*width + x;
                    int subimg_i = (y >> tf->pred_col.pow2)*tf->pred_col.width + (x >> tf->pred_col.pow2);

                    twp__rgba8 pix = img[i];
                    twp__rgba8 sub_pix = tf->pred_col.img[subimg_i];

                    // the three multipliers are stored in the channels of the sub-image pixel
                    int red_to_blue = sub_pix.r;
                    int green_to_blue = sub_pix.g;
                    int green_to_red = sub_pix.b;

                    int tmp_red = pix.r;
                    int tmp_blue = pix.b;

                    tmp_red += twp__color_transform_delta(green_to_red, pix.g);
                    tmp_blue += twp__color_transform_delta(green_to_blue, pix.g);
                    // uses the already corrected red value
                    tmp_blue += twp__color_transform_delta(red_to_blue, tmp_red);

                    img[i].r = (uint8_t)tmp_red;
                    img[i].b = (uint8_t)tmp_blue;
                }
            }

            return img;
        }

        case twp__TRANSFORM_SUBTRACT_GREEN: {
            for (int i = 0; i < width*height; ++i) {
                img[i].r += img[i].g;
                img[i].b += img[i].g;
            }

            return img;
        }

        case twp__TRANSFORM_COLOR_INDEXING: {
            if (tf->color_idxing.width_divider == 1) {
                // we can skip a bunch of work if there isn't any bit-packing
                for (int i = 0; i < width*height; ++i) {
                    int color_table_idx = img[i].g;
                    if (color_table_idx >= tf->color_idxing.color_table_size)
                        img[i] = twp__make_rgba8(0, 0, 0, 0);
                    else
                        img[i] = tf->color_idxing.color_table[color_table_idx];
                }

                return img;
            }

            twp__assert(tf->color_idxing.width_divider == 2 || tf->color_idxing.width_divider == 4 || tf->color_idxing.width_divider == 8);
            twp__assert(width == tf->color_idxing.divided_width);

            // note that at the end of each row there can be unused bits,
            // this happens when orig_width % width_divider != 0

            int bits_per_idx = 8 / tf->color_idxing.width_divider;
            int idx_bit_mask = (1 << bits_per_idx) - 1;
            twp__rgba8 *new_img = (twp__rgba8 *)malloc((size_t)tf->color_idxing.orig_width * (size_t)height * sizeof(twp__rgba8));
            if (!new_img) {
                free(img);
                return NULL;
            }

            for (int y = 0; y < height; ++y) {
                for (int x = 0; x < tf->color_idxing.orig_width; ++x) {
                    // several indices share the green channel of one packed pixel, lowest bits first
                    int packed = img[y*tf->color_idxing.divided_width + x/tf->color_idxing.width_divider].g;
                    int color_table_idx = (packed >> ((x % tf->color_idxing.width_divider) * bits_per_idx)) & idx_bit_mask;
                    if (color_table_idx >= tf->color_idxing.color_table_size)
                        new_img[y*tf->color_idxing.orig_width + x] = twp__make_rgba8(0, 0, 0, 0);
                    else
                        new_img[y*tf->color_idxing.orig_width + x] = tf->color_idxing.color_table[color_table_idx];
                }
            }

            free(img);
            return new_img;
        }

        default: {
            twp__assert(0);
            return img;
        }
    }
}
|
|
|
|
// Parses the VP8L header: signature byte 0x2f, 14-bit width-1 and height-1, one alpha hint bit
// and a 3-bit version that must be 0.
// Returns 1 on success (with *width / *height filled in) and 0 otherwise.
static int twp__read_vp8l_header(twp__bit_reader *reader, int *width, int *height)
{
    if (twp__read_bits(reader, 8) != 0x2f)
        return 0;

    *width = twp__read_bits(reader, 14) + 1;
    *height = twp__read_bits(reader, 14) + 1;
    // a failed read yields -1 + 1 == 0
    if (*width <= 0 || *height <= 0)
        return 0;

    // the alpha hint is read only to advance the reader; its value doesn't actually do anything
    if (twp__read_bits(reader, 1) == -1)
        return 0;

    int version = twp__read_bits(reader, 3);
    return version == 0;
}
|
|
|
|
// Decodes a VP8L (lossless) bitstream of `data_len` bytes at `data`.
//
// When `is_vp8_alpha` is zero the VP8L header is parsed and *width / *height are outputs.
// Otherwise no header is read and *width / *height must already hold the image size.
//
// Returns a malloc()'d buffer owned by the caller: 4 bytes per pixel, or 3 bytes per pixel (r, g, b)
// for twp_FORMAT_RGB. Returns NULL for the YUV formats, on an invalid stream, or when an allocation
// fails.
static unsigned char *twp__read_vp8l(void *data, int data_len, int *width, int *height, twp_format format, int is_vp8_alpha)
{
    if (format == twp_FORMAT_YUV || format == twp_FORMAT_YUVA)
        return NULL;

    twp__bit_reader reader;
    memset(&reader, 0, sizeof(reader));
    reader.data = (unsigned char *)data;
    reader.num_bytes = data_len;

    if (!is_vp8_alpha && !twp__read_vp8l_header(&reader, width, height))
        return NULL;

    // the header path guarantees this already; the alpha path takes the size from the caller
    if (*width <= 0 || *height <= 0)
        return NULL;

    twp__transform transforms[twp__TRANSFORM_COUNT];
    int num_transforms = twp__read_transforms(&reader, transforms, *width, *height);
    if (num_transforms == -1) return NULL;

    // check if we have a color indexing transform; if so, we need to use its subsampled width
    // for all image data that is processed *before* this transform is reversed
    int divided_width = *width;
    for (int i = 0; i < num_transforms; ++i) {
        twp__transform *tf = transforms + i;
        if (tf->type == twp__TRANSFORM_COLOR_INDEXING) {
            divided_width = tf->color_idxing.divided_width;
            break;
        }
    }

    twp__rgba8 *img = twp__read_coded_image(&reader, divided_width, *height, 1);

    // transforms are undone in the reverse of the order in which they were read
    int got_citf = 0;
    for (int i = num_transforms - 1; img && i >= 0; --i) { // this gets skipped if twp__read_coded_image returned NULL
        // we need to check if we already reversed a color indexing transform; if so, we need
        // to stop using the subsampled width and start using the original width
        twp__transform *tf = transforms + i;

        int width_to_use = got_citf ? *width : divided_width;
        img = twp__reverse_transform(img, width_to_use, *height, tf); // frees img itself on failure
        if (!img) break;

        if (tf->type == twp__TRANSFORM_COLOR_INDEXING) {
            twp__assert(!got_citf);
            got_citf = 1;
        }
    }
    twp__free_transforms(transforms, num_transforms);

    if (img && format == twp_FORMAT_RGB) {
        // repack to 3 bytes per pixel; if the allocation fails the result is NULL
        size_t num_pix = (size_t)*width * (size_t)*height;
        unsigned char *rgb = (unsigned char *)malloc(num_pix * 3);
        if (rgb) {
            unsigned char *ptr = rgb;
            for (size_t i = 0; i < num_pix; ++i) {
                *ptr++ = img[i].r;
                *ptr++ = img[i].g;
                *ptr++ = img[i].b;
            }
        }
        free(img);
        return rgb;
    }

    return (unsigned char *)img;
}
|
|
|
|
//
|
|
// vp8 decoding
|
|
//
|
|
|
|
// Word type of the arithmetic decoder, plus a helper that assembles one such word from a byte
// buffer with buf[0] as the most significant byte.
//
// gcc and clang compile this load pattern to a mov+bswap, which is what we want.
// msvc does not optimize it: https://developercommunity.visualstudio.com/t/Missed-optimization:-loadstore-coalesci/987039
// so msvc builds replace the macro with a function further below.
//
// The expansion is fully parenthesized and carries no trailing semicolon, so it can be used like
// an ordinary function call inside a larger expression. `buf` is evaluated once per byte and must
// therefore be free of side effects.

#ifdef twp__64BIT
typedef uint64_t twp__arith_dec_type;
#define twp__ARITH_DEC_SHIFT 56
#define twp__ARITH_DEC_READ_HELPER(buf) \
    (((uint64_t)(buf)[0] << 56) | ((uint64_t)(buf)[1] << 48) | \
     ((uint64_t)(buf)[2] << 40) | ((uint64_t)(buf)[3] << 32) | \
     ((uint64_t)(buf)[4] << 24) | ((uint64_t)(buf)[5] << 16) | \
     ((uint64_t)(buf)[6] <<  8) | ((uint64_t)(buf)[7] <<  0))
#else
typedef uint32_t twp__arith_dec_type;
#define twp__ARITH_DEC_SHIFT 24
#define twp__ARITH_DEC_READ_HELPER(buf) \
    (((uint32_t)(buf)[0] << 24) | ((uint32_t)(buf)[1] << 16) | \
     ((uint32_t)(buf)[2] <<  8) | ((uint32_t)(buf)[3] <<  0))
#endif
|
|
|
|
#ifdef _MSC_VER
#undef twp__ARITH_DEC_READ_HELPER
// msvc replacement for the macro of the same name: copies one decoder word out of `buf` and
// byte-swaps it on little-endian hosts.
twp__INLINE twp__arith_dec_type twp__ARITH_DEC_READ_HELPER(uint8_t *buf)
{
    twp__arith_dec_type word;
    memcpy(&word, buf, sizeof(twp__arith_dec_type));

    // msvc (and every other compiler) optimizes the endian check out and emits just a mov+bswap
    uint16_t probe = 0x1234;
    if (*(uint8_t *)&probe != 0x34)
        return word; // big-endian host: the bytes are already in the right order

#ifdef twp__64BIT
    return _byteswap_uint64(word);
#else
    return _byteswap_ulong(word);
#endif
}
#endif
|
|
|
|
typedef struct {
|
|
uint8_t *data;
|
|
int data_len;
|
|
int data_at;
|
|
twp__arith_dec_type value;
|
|
twp__arith_dec_type range;
|
|
int num_bits;
|
|
int err;
|
|
} twp__arith_dec;
|
|
|
|
// number of leading zero bits of each byte value (8 for the value 0), used in arithmetic decoding
static const uint8_t twp__lz_table[256] = {
    8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, // 0x00
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0x10
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x20
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0x30
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70
    // 0x80..0xff are implicitly 0
};
|
|
|
|
// Tokens of the DCT coefficient alphabet; these are the leaves of twp__dct_token_tree.
typedef enum {
    twp__DCT_VAL_0      = 0,
    twp__DCT_VAL_1      = 1,
    twp__DCT_VAL_2      = 2,
    twp__DCT_VAL_3      = 3,
    twp__DCT_VAL_4      = 4,
    twp__DCT_RANGE_0    = 5,
    twp__DCT_RANGE_1    = 6,
    twp__DCT_RANGE_2    = 7,
    twp__DCT_RANGE_3    = 8,
    twp__DCT_RANGE_4    = 9,
    twp__DCT_RANGE_5    = 10,
    twp__DCT_EOB        = 11,
    twp__NUM_DCT_TOKENS = 12
} twp__dct_token;
|
|
static const int twp__dct_token_tree[] = {
|
|
-twp__DCT_EOB, 2,
|
|
-twp__DCT_VAL_0, 4,
|
|
-twp__DCT_VAL_1, 6,
|
|
8, 12,
|
|
-twp__DCT_VAL_2, 10,
|
|
-twp__DCT_VAL_3, -twp__DCT_VAL_4,
|
|
14, 16,
|
|
-twp__DCT_RANGE_0, -twp__DCT_RANGE_1,
|
|
18, 20,
|
|
-twp__DCT_RANGE_2, -twp__DCT_RANGE_3,
|
|
-twp__DCT_RANGE_4, -twp__DCT_RANGE_5,
|
|
};
|
|
// Per-range probability lists, each terminated by a 0 entry.
// Index with enum_val - twp__DCT_RANGE_0.
static const int twp__dct_range_probs[][12] = {
    /* range 0 */ {159, 0},
    /* range 1 */ {165, 145, 0},
    /* range 2 */ {173, 148, 140, 0},
    /* range 3 */ {176, 155, 140, 135, 0},
    /* range 4 */ {180, 157, 141, 134, 130, 0},
    /* range 5 */ {254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0},
};

// one base value per range, in the same order as twp__dct_range_probs
static const int twp__dct_range_base[] = {
    5,  // range 0
    7,  // range 1
    11, // range 2
    19, // range 3
    35, // range 4
    67, // range 5
};
|
|
|
|
// Block types; used as the first index of twp__coeff_prob_type.
typedef enum {
    twp__DCT_TYPE_Y_WITHOUT_DC = 0,
    twp__DCT_TYPE_Y2           = 1,
    twp__DCT_TYPE_UV           = 2,
    twp__DCT_TYPE_Y_WITH_DC    = 3,
    twp__NUM_DCT_TYPES         = 4
} twp__dct_type;
|
|
|
|
// Planes that carry DCT coefficients.
typedef enum {
    twp__DCT_PLANE_Y    = 0,
    twp__DCT_PLANE_Y2   = 1,
    twp__DCT_PLANE_U    = 2,
    twp__DCT_PLANE_V    = 3,
    twp__NUM_DCT_PLANES = 4
} twp__dct_plane;
|
|
|
|
// Distinguishes the DC coefficient from the AC coefficients.
typedef enum {
    twp__DC                  = 0,
    twp__AC                  = 1,
    twp__NUM_DCT_COEFF_TYPES = 2
} twp__dct_coeff_type;
|
|
|
|
// highest band number that occurs in twp__coeff_bands
#define twp__COEFF_BAND_MAX 7

// band number for each of the 16 coefficient positions
static const int twp__coeff_bands[16] = {
    0, 1, 2, 3,
    6, 4, 5, 6,
    6, 6, 6, 6,
    6, 6, 6, 7,
};
|
|
|
|
#define twp__NUM_DCT_CTXS 3
|
|
typedef int twp__coeff_prob_type[twp__NUM_DCT_TYPES][twp__COEFF_BAND_MAX+1][twp__NUM_DCT_CTXS][twp__NUM_DCT_TOKENS-1];
|
|
|
|
// Coefficient update probabilities, laid out as [dct type][coefficient band][context][n]
// (see twp__coeff_prob_type).
static const twp__coeff_prob_type twp__coeff_update_probs = {
    { // twp__DCT_TYPE_Y_WITHOUT_DC
        { // band 0
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 1
            {176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255},
            {249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 2
            {255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255},
            {234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 3
            {255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 4
            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 5
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 6
            {255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255},
            {250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255},
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 7
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        }
    },
    { // twp__DCT_TYPE_Y2
        { // band 0
            {217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255},
            {234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255},
        },
        { // band 1
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 2
            {255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 3
            {255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 4
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 5
            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 6
            {255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255},
            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 7
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        }
    },
    { // twp__DCT_TYPE_UV
        { // band 0
            {186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255},
            {234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255},
            {251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255},
        },
        { // band 1
            {255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255},
        },
        { // band 2
            {255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 3
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 4
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 5
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 6
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 7
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        }
    },
    { // twp__DCT_TYPE_Y_WITH_DC
        { // band 0
            {248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255},
            {248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 1
            {255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255},
            {246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255},
            {252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255},
        },
        { // band 2
            {255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255},
            {248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255},
            {253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 3
            {255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 4
            {255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255},
            {252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 5
            {255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 6
            {255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255},
            {250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        },
        { // band 7
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
            {255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255},
        }
    }
};
|
|
|
|
// Default coefficient probabilities, laid out as [dct type][coefficient band][context][n]
// (see twp__coeff_prob_type).
static const twp__coeff_prob_type twp__default_coeff_probs = {
    { // twp__DCT_TYPE_Y_WITHOUT_DC
        { // band 0
            {128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
            {128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
            {128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
        },
        { // band 1
            {253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128},
            {189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128},
            {106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128},
        },
        { // band 2
            {  1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128},
            {181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128},
            { 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128},
        },
        { // band 3
            {  1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128},
            {184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128},
            { 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128},
        },
        { // band 4
            {  1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128},
            {170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128},
            { 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128},
        },
        { // band 5
            {  1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128},
            {207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128},
            {102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128},
        },
        { // band 6
            {  1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128},
            {177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128},
            { 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128},
        },
        { // band 7
            {  1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
            {246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
            {255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
        }
    },
    { // twp__DCT_TYPE_Y2
        { // band 0
            {198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62},
            {131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1},
            { 68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128},
        },
        { // band 1
            {  1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128},
            {184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128},
            { 81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128},
        },
        { // band 2
            {  1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128},
            { 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128},
            { 23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128},
        },
        { // band 3
            {  1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128},
            {109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128},
            { 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128},
        },
        { // band 4
            {  1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128},
            { 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128},
            { 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128},
        },
        { // band 5
            {  1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128},
            {124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128},
            { 35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128},
        },
        { // band 6
            {  1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128},
            {121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128},
            { 45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128},
        },
        { // band 7
            {  1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128},
            {203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128},
            {137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128},
        }
    },
    { // twp__DCT_TYPE_UV
        { // band 0
            {253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128},
            {175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128},
            { 73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128},
        },
        { // band 1
            {  1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128},
            {239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128},
            {155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128},
        },
        { // band 2
            {  1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128},
            {201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128},
            { 69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128},
        },
        { // band 3
            {  1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128},
            {223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128},
            {141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128},
        },
        { // band 4
            {  1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128},
            {190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128},
            {149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
        },
        { // band 5
            {  1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128},
            {247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128},
            {240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128},
        },
        { // band 6
            {  1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128},
            {213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128},
            { 55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128},
        },
        { // band 7
            {128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
            {128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
            {128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128},
        }
    },
    { // twp__DCT_TYPE_Y_WITH_DC
        { // band 0
            {202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255},
            {126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128},
            { 61,  46, 138, 219, 151, 178, 240, 170, 255, 216, 128},
        },
        { // band 1
            {  1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128},
            {166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128},
            { 39,  77, 162, 232, 172, 180, 245, 178, 255, 255, 128},
        },
        { // band 2
            {  1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128},
            {124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128},
            { 24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128},
        },
        { // band 3
            {  1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128},
            {149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128},
            { 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128},
        },
        { // band 4
            {  1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128},
            {123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128},
            { 20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128},
        },
        { // band 5
            {  1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128},
            {168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128},
            { 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128},
        },
        { // band 6
            {  1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128},
            {141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128},
            { 42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128},
        },
        { // band 7
            {  1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
            {244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
            {238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128},
        },
    },
};
|
|
|
|
// Macroblock prediction modes. B_PRED is the last luma mode and is not counted among the chroma
// modes, hence the two separate counts.
typedef enum {
    twp__MB_MODE_DC_PRED = 0,
    twp__MB_MODE_V_PRED  = 1,
    twp__MB_MODE_H_PRED  = 2,
    twp__MB_MODE_TM_PRED = 3,
    twp__MB_MODE_B_PRED  = 4,

    twp__NUM_UV_MB_MODES = twp__MB_MODE_B_PRED,
    twp__NUM_Y_MB_MODES  = 5
} twp__mb_mode;
|
|
static const int twp__mb_y_mode_tree[] = {
|
|
-twp__MB_MODE_B_PRED, 2,
|
|
4, 6,
|
|
-twp__MB_MODE_DC_PRED, -twp__MB_MODE_V_PRED,
|
|
-twp__MB_MODE_H_PRED, -twp__MB_MODE_TM_PRED,
|
|
};
|
|
// one probability per pair of entries in twp__mb_y_mode_tree (twp__read_arith_tree() indexes
// the probabilities with node_index >> 1). presumably the fixed keyframe probabilities - confirm against the caller
static const int twp__mb_y_mode_tree_probs[] = {145, 156, 163, 128};
|
|
static const int twp__mb_uv_mode_tree[] = {
|
|
-twp__MB_MODE_DC_PRED, 2,
|
|
-twp__MB_MODE_V_PRED, 4,
|
|
-twp__MB_MODE_H_PRED, -twp__MB_MODE_TM_PRED,
|
|
};
|
|
// one probability per pair of entries in twp__mb_uv_mode_tree (indexed with node_index >> 1
// by twp__read_arith_tree()). presumably the fixed keyframe probabilities - confirm against the caller
static const int twp__mb_uv_mode_tree_probs[] = {142, 114, 183};
|
|
|
|
// Intra prediction modes of a single 4x4 luma subblock (only used when the macroblock mode
// is twp__MB_MODE_B_PRED). The values are used (negated) as leaves of twp__sb_mode_tree and as
// indices into twp__sb_mode_tree_probs, so the numbering must not change.
typedef enum {
    twp__SB_MODE_DC_PRED = 0,
    twp__SB_MODE_TM_PRED = 1,

    twp__SB_MODE_VE_PRED = 2,
    twp__SB_MODE_HE_PRED = 3,

    twp__SB_MODE_LD_PRED = 4,
    twp__SB_MODE_RD_PRED = 5,

    twp__SB_MODE_VR_PRED = 6,
    twp__SB_MODE_VL_PRED = 7,
    twp__SB_MODE_HD_PRED = 8,
    twp__SB_MODE_HU_PRED = 9,

    twp__NUM_SUBBLOCK_MODES = 10
} twp__sb_mode;
|
|
static const int twp__sb_mode_tree[] = {
|
|
-twp__SB_MODE_DC_PRED, 2,
|
|
-twp__SB_MODE_TM_PRED, 4,
|
|
-twp__SB_MODE_VE_PRED, 6,
|
|
8, 12,
|
|
-twp__SB_MODE_HE_PRED, 10,
|
|
-twp__SB_MODE_RD_PRED, -twp__SB_MODE_VR_PRED,
|
|
-twp__SB_MODE_LD_PRED, 14,
|
|
-twp__SB_MODE_VL_PRED, 16,
|
|
-twp__SB_MODE_HD_PRED, -twp__SB_MODE_HU_PRED,
|
|
};
|
|
// Probabilities for twp__sb_mode_tree. The last dimension holds one probability per pair of
// tree entries (twp__NUM_SUBBLOCK_MODES-1 of them). The first two dimensions are indexed by
// twp__sb_mode values - presumably the modes of the two neighboring subblocks; confirm the
// index order (above/left) against the code that reads this table.
static const int twp__sb_mode_tree_probs[twp__NUM_SUBBLOCK_MODES][twp__NUM_SUBBLOCK_MODES][twp__NUM_SUBBLOCK_MODES-1] = {
    { // [0]
        {231, 120, 48, 89, 115, 113, 120, 152, 112},
        {152, 179, 64, 126, 170, 118, 46, 70, 95},
        {175, 69, 143, 80, 85, 82, 72, 155, 103},
        {56, 58, 10, 171, 218, 189, 17, 13, 152},
        {144, 71, 10, 38, 171, 213, 144, 34, 26},
        {114, 26, 17, 163, 44, 195, 21, 10, 173},
        {121, 24, 80, 195, 26, 62, 44, 64, 85},
        {170, 46, 55, 19, 136, 160, 33, 206, 71},
        {63, 20, 8, 114, 114, 208, 12, 9, 226},
        {81, 40, 11, 96, 182, 84, 29, 16, 36},
    },
    { // [1]
        {134, 183, 89, 137, 98, 101, 106, 165, 148},
        {72, 187, 100, 130, 157, 111, 32, 75, 80},
        {66, 102, 167, 99, 74, 62, 40, 234, 128},
        {41, 53, 9, 178, 241, 141, 26, 8, 107},
        {104, 79, 12, 27, 217, 255, 87, 17, 7},
        {74, 43, 26, 146, 73, 166, 49, 23, 157},
        {65, 38, 105, 160, 51, 52, 31, 115, 128},
        {87, 68, 71, 44, 114, 51, 15, 186, 23},
        {47, 41, 14, 110, 182, 183, 21, 17, 194},
        {66, 45, 25, 102, 197, 189, 23, 18, 22},
    },
    { // [2]
        {88, 88, 147, 150, 42, 46, 45, 196, 205},
        {43, 97, 183, 117, 85, 38, 35, 179, 61},
        {39, 53, 200, 87, 26, 21, 43, 232, 171},
        {56, 34, 51, 104, 114, 102, 29, 93, 77},
        {107, 54, 32, 26, 51, 1, 81, 43, 31},
        {39, 28, 85, 171, 58, 165, 90, 98, 64},
        {34, 22, 116, 206, 23, 34, 43, 166, 73},
        {68, 25, 106, 22, 64, 171, 36, 225, 114},
        {34, 19, 21, 102, 132, 188, 16, 76, 124},
        {62, 18, 78, 95, 85, 57, 50, 48, 51},
    },
    { // [3]
        {193, 101, 35, 159, 215, 111, 89, 46, 111},
        {60, 148, 31, 172, 219, 228, 21, 18, 111},
        {112, 113, 77, 85, 179, 255, 38, 120, 114},
        {40, 42, 1, 196, 245, 209, 10, 25, 109},
        {100, 80, 8, 43, 154, 1, 51, 26, 71},
        {88, 43, 29, 140, 166, 213, 37, 43, 154},
        {61, 63, 30, 155, 67, 45, 68, 1, 209},
        {142, 78, 78, 16, 255, 128, 34, 197, 171},
        {41, 40, 5, 102, 211, 183, 4, 1, 221},
        {51, 50, 17, 168, 209, 192, 23, 25, 82},
    },
    { // [4]
        {125, 98, 42, 88, 104, 85, 117, 175, 82},
        {95, 84, 53, 89, 128, 100, 113, 101, 45},
        {75, 79, 123, 47, 51, 128, 81, 171, 1},
        {57, 17, 5, 71, 102, 57, 53, 41, 49},
        {115, 21, 2, 10, 102, 255, 166, 23, 6},
        {38, 33, 13, 121, 57, 73, 26, 1, 85},
        {41, 10, 67, 138, 77, 110, 90, 47, 114},
        {101, 29, 16, 10, 85, 128, 101, 196, 26},
        {57, 18, 10, 102, 102, 213, 34, 20, 43},
        {117, 20, 15, 36, 163, 128, 68, 1, 26},
    },
    { // [5]
        {138, 31, 36, 171, 27, 166, 38, 44, 229},
        {67, 87, 58, 169, 82, 115, 26, 59, 179},
        {63, 59, 90, 180, 59, 166, 93, 73, 154},
        {40, 40, 21, 116, 143, 209, 34, 39, 175},
        {57, 46, 22, 24, 128, 1, 54, 17, 37},
        {47, 15, 16, 183, 34, 223, 49, 45, 183},
        {46, 17, 33, 183, 6, 98, 15, 32, 183},
        {65, 32, 73, 115, 28, 128, 23, 128, 205},
        {40, 3, 9, 115, 51, 192, 18, 6, 223},
        {87, 37, 9, 115, 59, 77, 64, 21, 47},
    },
    { // [6]
        {104, 55, 44, 218, 9, 54, 53, 130, 226},
        {64, 90, 70, 205, 40, 41, 23, 26, 57},
        {54, 57, 112, 184, 5, 41, 38, 166, 213},
        {30, 34, 26, 133, 152, 116, 10, 32, 134},
        {75, 32, 12, 51, 192, 255, 160, 43, 51},
        {39, 19, 53, 221, 26, 114, 32, 73, 255},
        {31, 9, 65, 234, 2, 15, 1, 118, 73},
        {88, 31, 35, 67, 102, 85, 55, 186, 85},
        {56, 21, 23, 111, 59, 205, 45, 37, 192},
        {55, 38, 70, 124, 73, 102, 1, 34, 98},
    },
    { // [7]
        {102, 61, 71, 37, 34, 53, 31, 243, 192},
        {69, 60, 71, 38, 73, 119, 28, 222, 37},
        {68, 45, 128, 34, 1, 47, 11, 245, 171},
        {62, 17, 19, 70, 146, 85, 55, 62, 70},
        {75, 15, 9, 9, 64, 255, 184, 119, 16},
        {37, 43, 37, 154, 100, 163, 85, 160, 1},
        {63, 9, 92, 136, 28, 64, 32, 201, 85},
        {86, 6, 28, 5, 64, 255, 25, 248, 1},
        {56, 8, 17, 132, 137, 255, 55, 116, 128},
        {58, 15, 20, 82, 135, 57, 26, 121, 40},
    },
    { // [8]
        {164, 50, 31, 137, 154, 133, 25, 35, 218},
        {51, 103, 44, 131, 131, 123, 31, 6, 158},
        {86, 40, 64, 135, 148, 224, 45, 183, 128},
        {22, 26, 17, 131, 240, 154, 14, 1, 209},
        {83, 12, 13, 54, 192, 255, 68, 47, 28},
        {45, 16, 21, 91, 64, 222, 7, 1, 197},
        {56, 21, 39, 155, 60, 138, 23, 102, 213},
        {85, 26, 85, 85, 128, 128, 32, 146, 171},
        {18, 11, 7, 63, 144, 171, 4, 4, 246},
        {35, 27, 10, 146, 174, 171, 12, 26, 128},
    },
    { // [9]
        {190, 80, 35, 99, 180, 80, 126, 54, 45},
        {85, 126, 47, 87, 176, 51, 41, 20, 32},
        {101, 75, 128, 139, 118, 146, 116, 128, 85},
        {56, 41, 15, 176, 236, 85, 37, 9, 62},
        {146, 36, 19, 30, 171, 255, 97, 27, 20},
        {71, 30, 17, 119, 118, 255, 17, 18, 138},
        {101, 38, 60, 138, 55, 70, 43, 26, 142},
        {138, 45, 61, 62, 219, 1, 81, 188, 64},
        {32, 41, 20, 117, 151, 142, 20, 21, 163},
        {112, 19, 12, 61, 195, 128, 48, 4, 24},
    },
};
|
|
|
|
// Decoding tree for a segment id (0..3), in the layout twp__read_arith_tree() expects:
// pairs of entries, positive = index of the next pair, non-positive = negated segment id.
static const int twp__segment_id_tree[] = {
    /* 0 */  2,  4,
    /* 2 */ -0, -1,
    /* 4 */ -2, -3
};
|
|
|
|
// number of entries in each row of twp__quant_tables; twp__get_quant() clamps its index to [0, twp__QUANT_TABLE_SIZE-1]
#define twp__QUANT_TABLE_SIZE 128
|
|
|
|
// Quantizer lookup tables, one row per coefficient type (dc, ac), each indexed by a
// quantizer index in [0, twp__QUANT_TABLE_SIZE-1]. Read through twp__get_quant().
static const int twp__quant_tables[twp__NUM_DCT_COEFF_TYPES][twp__QUANT_TABLE_SIZE] = {
    { // dc
        /*   0 */   4,   5,   6,   7,   8,   9,  10,  10,  11,  12,  13,  14,  15,  16,  17,  17,
        /*  16 */  18,  19,  20,  20,  21,  21,  22,  22,  23,  23,  24,  25,  25,  26,  27,  28,
        /*  32 */  29,  30,  31,  32,  33,  34,  35,  36,  37,  37,  38,  39,  40,  41,  42,  43,
        /*  48 */  44,  45,  46,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,
        /*  64 */  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,
        /*  80 */  75,  76,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,
        /*  96 */  91,  93,  95,  96,  98, 100, 101, 102, 104, 106, 108, 110, 112, 114, 116, 118,
        /* 112 */ 122, 124, 126, 128, 130, 132, 134, 136, 138, 140, 143, 145, 148, 151, 154, 157,
    },
    { // ac
        /*   0 */   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16,  17,  18,  19,
        /*  16 */  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32,  33,  34,  35,
        /*  32 */  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        /*  48 */  52,  53,  54,  55,  56,  57,  58,  60,  62,  64,  66,  68,  70,  72,  74,  76,
        /*  64 */  78,  80,  82,  84,  86,  88,  90,  92,  94,  96,  98, 100, 102, 104, 106, 108,
        /*  80 */ 110, 112, 114, 116, 119, 122, 125, 128, 131, 134, 137, 140, 143, 146, 149, 152,
        /*  96 */ 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209,
        /* 112 */ 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284,
    },
};
|
|
|
|
// Maps the decoding position of a coefficient (0..15) to its index in the
// row-major 4x4 coefficient block.
static const int twp__zigzag[] = {
     0,  1,  4,  8,
     5,  2,  3,  6,
     9, 12, 13, 10,
     7, 11, 14, 15
};
|
|
|
|
// capacity of the dct_partitions array in twp__vp8_data.
// (the name is spelled "PARITIONS"; it is kept as-is because other code refers to it)
#define twp__MAX_PARITIONS 8
|
|
|
|
typedef struct {
|
|
int offset;
|
|
int size;
|
|
twp__arith_dec dec;
|
|
} twp__dct_partition;
|
|
|
|
// capacity of the segments array in twp__vp8_data (twp__segment_id_tree has leaves 0..3)
#define twp__MAX_SEGMENTS 4
|
|
|
|
// How the per-segment values in twp__segment are to be interpreted.
// NOTE(review): presumably DELTA = added to the frame-level value, ABSOLUTE = replaces it - confirm.
typedef enum {
    twp__SEGMENT_MODE_DELTA    = 0,
    twp__SEGMENT_MODE_ABSOLUTE = 1
} twp__segment_mode;
|
|
|
|
// Per-segment adjustments.
typedef struct {
    int quant_update; // presumably the quantizer index update for this segment - confirm
    int lf_update;    // presumably the loop filter level update for this segment - confirm
} twp__segment;
|
|
|
|
// Per-macroblock information.
// One of these exists for every macroblock (see mb_infos in twp__vp8_data), which makes
// it a relatively big allocation - so the fields are kept small where possible.
typedef struct {
    uint8_t skip;
    uint8_t segment_id;
    uint8_t y_mode;         // a twp__mb_mode value
    uint8_t y_sb_modes[16]; // twp__sb_mode values, indexed with sy*4 + sx
    uint8_t uv_mode;        // a twp__mb_mode value
    uint8_t skip_sb_filtering;

    // macroblock coordinates; x == 0 / y == 0 mark the left / top edge for DC prediction
    int x;
    int y;
} twp__mb_info;
|
|
|
|
// Which loop filter variant the frame uses.
typedef enum {
    twp__FILTER_NORMAL = 0,
    twp__FILTER_SIMPLE = 1
} twp__filter_type;
|
|
|
|
// Upscaling setting from the frame header. The numeric value is the 2-bit field stored
// in the top bits of the width/height bytes, so the order must not change.
typedef enum {
    twp__UPSCALING_NONE     = 0,
    twp__UPSCALING_5_OVER_4 = 1,
    twp__UPSCALING_5_OVER_3 = 2,
    twp__UPSCALING_2        = 3
} twp__upscaling;
|
|
|
|
typedef struct {
|
|
int width;
|
|
int height;
|
|
int luma_width;
|
|
int luma_height;
|
|
int luma_stride;
|
|
int chroma_width;
|
|
int chroma_height;
|
|
int chroma_stride;
|
|
|
|
twp__upscaling upscaling_x;
|
|
twp__upscaling upscaling_y;
|
|
|
|
twp__mb_info *mb_infos;
|
|
int num_mbs;
|
|
int mbs_per_row;
|
|
int mbs_per_col;
|
|
int num_sbs;
|
|
int sbs_per_row;
|
|
int sbs_per_col;
|
|
|
|
int require_clamping;
|
|
|
|
twp__filter_type loop_filter_type;
|
|
int loop_filter_level;
|
|
int loop_filter_sharpness;
|
|
|
|
int lf_adj_enabled;
|
|
int lf_adj_ref_frame[4];
|
|
int lf_adj_mb_mode[4];
|
|
|
|
int frame_quant_idx_base;
|
|
int quant_idx_deltas[twp__NUM_DCT_PLANES][twp__NUM_DCT_COEFF_TYPES];
|
|
|
|
int segmentation_enabled;
|
|
int all_mbs_are_segment_0;
|
|
twp__segment_mode segment_mode;
|
|
twp__segment segments[twp__MAX_SEGMENTS];
|
|
int segment_id_tree_probs[3];
|
|
|
|
int first_partition_size;
|
|
twp__arith_dec first_partition_dec;
|
|
|
|
int num_dct_partitions;
|
|
twp__dct_partition dct_partitions[twp__MAX_PARITIONS];
|
|
|
|
twp__coeff_prob_type coeff_probs;
|
|
|
|
uint8_t *plane_y;
|
|
uint8_t *plane_u;
|
|
uint8_t *plane_v;
|
|
} twp__vp8_data;
|
|
|
|
// size in bytes of the uncompressed part of the VP8 keyframe header (frame tag, start code,
// width and height); the first partition begins at this offset
#define twp__UNCOMPRESSED_VP8_HEADER_SIZE 10
|
|
|
|
// Prepares `dec` for reading arithmetic-coded bits from data[0 .. data_len-1].
// The first byte is consumed right away and placed at bit position twp__ARITH_DEC_SHIFT of
// `value`; `range` starts out at 255.
//
// An empty buffer (data_len < 1) used to be guarded only by an assert, which meant data[0] was
// read out of bounds whenever asserts are disabled and a file declares a zero-sized partition.
// Now nothing is read in that case: the decoder is set up with no input and `err` already set,
// so twp__read_arith_bit() never touches `data` and the caller can detect the failure via dec->err.
static void twp__init_arith_decoder(twp__arith_dec *dec, uint8_t *data, int data_len)
{
    int has_input = (data_len >= 1);

    dec->data = data;
    dec->data_len = has_input ? data_len : 0;
    dec->data_at = has_input ? 1 : 0;
    dec->value = 0;
    if (has_input)
        dec->value = (twp__arith_dec_type)data[0] << twp__ARITH_DEC_SHIFT;
    dec->range = 255;
    dec->num_bits = 0;
    dec->err = has_input ? 0 : 1;
}
|
|
|
|
// Decodes a single bit from the arithmetic-coded stream.
// `prob` (0..255) controls where the current range is split: the bigger it is, the bigger the
// part of the range that decodes to 0.
// When the input is exhausted, dec->err is set and decoding continues without new bits.
twp__INLINE static int twp__read_arith_bit(twp__arith_dec *dec, int prob)
{
    // renormalize: the table gives the shift that is needed for the current range
    uint8_t norm = twp__lz_table[dec->range];
    dec->range <<= norm;
    dec->value <<= norm;
    dec->num_bits -= norm;

    if (dec->num_bits < 0) {
        // we ran out of buffered bits, so refill `value` from the input
        int remaining = dec->data_len - dec->data_at;
        if (remaining < 1) {
            dec->err = 1;
        } else if (remaining < (int)sizeof(twp__arith_dec_type)) {
            // near the end of the input: append the remaining bytes one at a time
            for (int i = 0; i < remaining; ++i)
                dec->value |= (twp__arith_dec_type)dec->data[dec->data_at++] << ((twp__ARITH_DEC_SHIFT-8 - i*8) - dec->num_bits);
            dec->num_bits += remaining * 8;
        } else {
            // fast path: load a whole word, of which all but one byte are consumed
            twp__arith_dec_type chunk = twp__ARITH_DEC_READ_HELPER(dec->data + dec->data_at);
            dec->value |= chunk >> (8 + dec->num_bits);
            dec->num_bits += twp__ARITH_DEC_SHIFT;
            dec->data_at += sizeof(twp__arith_dec_type)-1;
        }
    }

    twp__arith_dec_type split = 1 + (((dec->range - 1) * (twp__arith_dec_type)prob) >> 8);

    if (dec->value < (split << twp__ARITH_DEC_SHIFT)) {
        // lower part of the range
        dec->range = split;
        return 0;
    }

    // upper part of the range
    dec->range -= split;
    dec->value -= split << twp__ARITH_DEC_SHIFT;
    return 1;
}
|
|
|
|
// Reads an unsigned n-bit number (1 <= n <= 31), most significant bit first.
// Every bit is decoded with a probability of 128.
twp__INLINE static int twp__read_arith_literal(twp__arith_dec *dec, int n)
{
    twp__assert(n >= 1 && n <= 31);

    int value = 0;
    while (n-- > 0)
        value = (value << 1) | twp__read_arith_bit(dec, 128);
    return value;
}
|
|
|
|
// Reads an n-bit magnitude followed by a single sign bit (1 == negative).
twp__INLINE static int twp__read_arith_signed_literal(twp__arith_dec *dec, int n)
{
    twp__assert(n >= 1 && n <= 31);

    // the magnitude comes first in the stream, so these two reads must stay in this order
    int magnitude = twp__read_arith_literal(dec, n);
    int negative = twp__read_arith_literal(dec, 1);

    if (negative)
        return -magnitude;
    return magnitude;
}
|
|
|
|
// Reads a 1-bit flag. If it is set, a signed n-bit literal follows and is returned,
// otherwise nothing more is read and value_if_flag_is_zero is returned.
twp__INLINE static int twp__read_arith_flagged_signed_literal(twp__arith_dec *dec, int n, int value_if_flag_is_zero)
{
    if (!twp__read_arith_literal(dec, 1))
        return value_if_flag_is_zero;

    return twp__read_arith_signed_literal(dec, n);
}
|
|
|
|
// Walks a decoding tree, reading one bit per level, and returns the value stored in the leaf.
// `tree` holds pairs of entries (one for bit 0, one for bit 1): a positive entry is the index of
// the next pair, a non-positive entry is a leaf containing the negated result.
// `prob` holds one probability per pair, and `init` is the index of the pair to start at.
twp__INLINE static int twp__read_arith_tree(twp__arith_dec *dec, const int *tree, const int *prob, int init)
{
    // tree and prob are trusted to be well-formed, which is why there
    // are no bounds checks on node or node >> 1
    int node = init;
    for (;;) {
        int bit = twp__read_arith_bit(dec, prob[node >> 1]);
        node = tree[node + bit];
        if (node <= 0)
            break;
    }
    return -node;
}
|
|
|
|
// Looks up the quantizer for the given coefficient type, clamping idx into the table first.
twp__INLINE static int twp__get_quant(twp__dct_coeff_type type, int idx)
{
    const int last = twp__QUANT_TABLE_SIZE-1;
    int clamped = idx;
    if (clamped < 0)
        clamped = 0;
    if (clamped > last)
        clamped = last;
    return twp__quant_tables[type][clamped];
}
|
|
|
|
static int twp__read_residual_block(twp__arith_dec *dec, twp__dct_type dct_type, short *coeffs,
|
|
twp__coeff_prob_type coeff_probs, uint8_t *nz_left, uint8_t *nz_above, int dc_quant, int ac_quant)
|
|
{
|
|
memset(coeffs, 0, sizeof(*coeffs) * 16);
|
|
|
|
int all_zero = 1;
|
|
|
|
int ctx = 0;
|
|
if (*nz_left) ++ctx;
|
|
if (*nz_above) ++ctx;
|
|
*nz_left = 0;
|
|
*nz_above = 0;
|
|
|
|
for (int i = (dct_type == twp__DCT_TYPE_Y_WITHOUT_DC); i < 16; ++i) {
|
|
twp__assert(ctx >= 0 && ctx <= 2);
|
|
|
|
int *prob = coeff_probs[dct_type][twp__coeff_bands[i]][ctx];
|
|
|
|
if (twp__read_arith_bit(dec, prob[0]) == 0)
|
|
break; // eob
|
|
|
|
// this works because if we just decoded a 0, the first level of the tree (which is eob) is skipped.
|
|
// this is because it doesn't make any sense to decode an eob right after a 0. you should have
|
|
// just decoded an eob instead.
|
|
while (twp__read_arith_bit(dec, prob[1]) == 0) {
|
|
ctx = 0;
|
|
++i;
|
|
if (i == 16)
|
|
return all_zero;
|
|
prob = coeff_probs[dct_type][twp__coeff_bands[i]][ctx];
|
|
}
|
|
|
|
int coeff = twp__read_arith_tree(dec, twp__dct_token_tree, prob, 4);
|
|
|
|
if (coeff >= twp__DCT_RANGE_0) {
|
|
const int *range_prob = twp__dct_range_probs[coeff - twp__DCT_RANGE_0];
|
|
int base = twp__dct_range_base[coeff - twp__DCT_RANGE_0];
|
|
coeff = 0;
|
|
int j = 0;
|
|
for (;;) {
|
|
int p = range_prob[j++];
|
|
if (p == 0)
|
|
break;
|
|
int bit = twp__read_arith_bit(dec, p);
|
|
coeff += coeff + bit;
|
|
}
|
|
coeff += base;
|
|
}
|
|
|
|
int sign = twp__read_arith_literal(dec, 1);
|
|
if (sign)
|
|
coeff = -coeff;
|
|
|
|
*nz_left = 1;
|
|
*nz_above = 1;
|
|
all_zero = 0;
|
|
|
|
coeffs[twp__zigzag[i]] = (short)(coeff * ((i == 0) ? dc_quant : ac_quant));
|
|
|
|
ctx = coeff;
|
|
if (ctx < 0) ctx = -ctx;
|
|
if (ctx > 2) ctx = 2;
|
|
}
|
|
|
|
return all_zero;
|
|
}
|
|
|
|
#ifdef twp__SSE2
|
|
|
|
twp__INLINE static __m128i twp__idct_C(__m128i x)
{
    // x * (sqrt(2) * cos(pi/8)), for eight 16-bit lanes at once.
    // the factor is > 1, but _mm_mulhi_epi16 can only multiply by a 16-bit fixed-point
    // fraction, so (just like in the scalar version) we compute x + x*(factor-1) instead
    const __m128i frac = _mm_set1_epi16(20091);
    __m128i scaled = _mm_mulhi_epi16(x, frac);
    return _mm_add_epi16(x, scaled);
}
|
|
|
|
twp__INLINE static __m128i twp__idct_S(__m128i x)
{
    // x * (sqrt(2) * sin(pi/8)), for eight 16-bit lanes at once.
    // the factor is < 1 here, but its fixed-point value (35468) doesn't fit into a signed
    // 16-bit number. 35468 - 65536 == -30068, so we multiply by (factor-1) and add x back
    const __m128i frac = _mm_set1_epi16(-30068);
    __m128i scaled = _mm_mulhi_epi16(x, frac);
    return _mm_add_epi16(x, scaled);
}
|
|
|
|
twp__INLINE static void twp__idct_step(__m128i *i0, __m128i *i1, __m128i *i2, __m128i *i3)
|
|
{
|
|
__m128i t0 = _mm_add_epi16(*i0, *i2);
|
|
__m128i t1 = _mm_sub_epi16(*i0, *i2);
|
|
__m128i t2 = _mm_sub_epi16(twp__idct_S(*i1), twp__idct_C(*i3));
|
|
__m128i t3 = _mm_add_epi16(twp__idct_C(*i1), twp__idct_S(*i3));
|
|
*i0 = _mm_add_epi16(t0, t3);
|
|
*i1 = _mm_add_epi16(t1, t2);
|
|
*i2 = _mm_sub_epi16(t1, t2);
|
|
*i3 = _mm_sub_epi16(t0, t3);
|
|
}
|
|
|
|
// Transposes a 4x4 matrix of 16-bit values that is stored in the low halves of i0..i3.
// Only the low 64 bits of each output are meaningful.
twp__INLINE static void twp__idct_transpose(__m128i *i0, __m128i *i1, __m128i *i2, __m128i *i3)
{
    // input (low halves):
    //   i0 =  0  1  2  3
    //   i1 =  4  5  6  7
    //   i2 =  8  9 10 11
    //   i3 = 12 13 14 15

    __m128i rows01 = _mm_unpacklo_epi16(*i0, *i1); // 0  4 1  5  2  6  3  7
    __m128i rows23 = _mm_unpacklo_epi16(*i2, *i3); // 8 12 9 13 10 14 11 15

    __m128i cols01 = _mm_unpacklo_epi32(rows01, rows23); // 0 4  8 12 1 5  9 13
    __m128i cols23 = _mm_unpackhi_epi32(rows01, rows23); // 2 6 10 14 3 7 11 15

    *i0 = cols01;
    *i1 = _mm_unpackhi_epi64(cols01, cols01); // 1 5  9 13 1 5  9 13
    *i2 = cols23;
    *i3 = _mm_unpackhi_epi64(cols23, cols23); // 3 7 11 15 3 7 11 15
}
|
|
|
|
// Adds the four residuals in the low 16-bit lanes of xmm to the four prediction pixels
// in row n of the subblock, and stores the result clamped to 0..255.
twp__INLINE static void twp__idct_write(__m128i xmm, uint8_t *plane_buf, int sb_offset, int stride, int n)
{
    uint8_t *row = plane_buf + sb_offset + stride*n;

    // go through memcpy for the 4-byte load and store, to stay clear of
    // alignment and aliasing issues
    int packed;
    memcpy(&packed, row, 4);

    __m128i pixels = _mm_unpacklo_epi8(_mm_cvtsi32_si128(packed), _mm_setzero_si128());
    pixels = _mm_add_epi16(pixels, xmm);
    pixels = _mm_packus_epi16(pixels, pixels); // the saturating pack clamps for free

    packed = _mm_cvtsi128_si32(pixels);
    memcpy(row, &packed, 4);
}
|
|
|
|
#else
|
|
|
|
twp__INLINE static int twp__idct_C(int x)
{
    // x * (sqrt(2) * cos(pi/8))
    // the factor is > 1, so multiplying by it directly in fixed-point could overflow.
    // instead we use x*a == x + x*(a-1), where (a-1) is 20091/65536
    int frac_part = twp__sra(x * 20091, 16);
    return x + frac_part;
}
|
|
|
|
twp__INLINE static int twp__idct_S(int x)
{
    // x * (sqrt(2) * sin(pi/8)), with the factor as the 16.16 fixed-point value 35468/65536
    int product = x * 35468;
    return twp__sra(product, 16);
}
|
|
|
|
#endif
|
|
|
|
// Inverse DCT of one 4x4 block. The result is added to the 4x4 subblock (sb_x, sb_y) of the
// macroblock at mb_offset in plane_buf, clamped to 0..255.
// The prediction must already be written into plane_buf when this function is called!
// In the non-SSE2 build `input` is used as scratch space and is overwritten.
static void twp__idct(short *input, uint8_t *plane_buf, int mb_offset, int sb_x, int sb_y, int stride)
{
    int sb_offset = mb_offset + sb_y*4*stride + sb_x*4;

#ifdef twp__SSE2
    // the biggest coefficient is +-2048, which means we never have to worry about 16-bit overflow here

    const __m128i rounding = _mm_set1_epi16(4);

    __m128i r0 = _mm_loadl_epi64((__m128i *)(input + 0));
    __m128i r1 = _mm_loadl_epi64((__m128i *)(input + 4));
    __m128i r2 = _mm_loadl_epi64((__m128i *)(input + 8));
    __m128i r3 = _mm_loadl_epi64((__m128i *)(input + 12));

    // first pass, then transpose so that the second pass works along the other axis
    twp__idct_step(&r0, &r1, &r2, &r3);
    twp__idct_transpose(&r0, &r1, &r2, &r3);
    twp__idct_step(&r0, &r1, &r2, &r3);

    // (x + 4) >> 3
    r0 = _mm_srai_epi16(_mm_add_epi16(r0, rounding), 3);
    r1 = _mm_srai_epi16(_mm_add_epi16(r1, rounding), 3);
    r2 = _mm_srai_epi16(_mm_add_epi16(r2, rounding), 3);
    r3 = _mm_srai_epi16(_mm_add_epi16(r3, rounding), 3);

    // transpose back into row order
    twp__idct_transpose(&r0, &r1, &r2, &r3);

    twp__idct_write(r0, plane_buf, sb_offset, stride, 0);
    twp__idct_write(r1, plane_buf, sb_offset, stride, 1);
    twp__idct_write(r2, plane_buf, sb_offset, stride, 2);
    twp__idct_write(r3, plane_buf, sb_offset, stride, 3);
#else
    // first pass: transform every column
    for (int c = 0; c < 4; ++c) {
        int in0 = input[c + 0];
        int in1 = input[c + 4];
        int in2 = input[c + 8];
        int in3 = input[c + 12];

        int even_sum  = in0 + in2;
        int even_diff = in0 - in2;
        int odd_diff  = twp__idct_S(in1) - twp__idct_C(in3);
        int odd_sum   = twp__idct_C(in1) + twp__idct_S(in3);

        input[c + 0]  = (short)(even_sum  + odd_sum);
        input[c + 4]  = (short)(even_diff + odd_diff);
        input[c + 8]  = (short)(even_diff - odd_diff);
        input[c + 12] = (short)(even_sum  - odd_sum);
    }

    // second pass: transform every row, with the final rounding (x + 4) >> 3
    for (int r = 0; r < 4; ++r) {
        short *row = input + r*4;

        int even_sum  = row[0] + row[2];
        int even_diff = row[0] - row[2];
        int odd_diff  = twp__idct_S(row[1]) - twp__idct_C(row[3]);
        int odd_sum   = twp__idct_C(row[1]) + twp__idct_S(row[3]);

        row[0] = (short)(twp__sra(even_sum  + odd_sum  + 4, 3));
        row[1] = (short)(twp__sra(even_diff + odd_diff + 4, 3));
        row[2] = (short)(twp__sra(even_diff - odd_diff + 4, 3));
        row[3] = (short)(twp__sra(even_sum  - odd_sum  + 4, 3));
    }

    // add the residual to the prediction
    for (int r = 0; r < 4; ++r) {
        const short *residual = input + r*4;
        uint8_t *out = plane_buf + sb_offset + r*stride;
        for (int c = 0; c < 4; ++c) {
            int val = out[c] + residual[c];
            out[c] = (uint8_t)twp__clamp(val, 0, 255);
        }
    }
#endif
}
|
|
|
|
// In-place inverse Walsh-Hadamard transform of a row-major 4x4 block.
static void twp__iwht(short *input)
{
    // this function should probably be simd'ized, but it doesn't seem to be a hotspot,
    // so i didn't bother (yet)..

    // first pass: along the columns
    for (int c = 0; c < 4; ++c) {
        short *col = input + c;
        int outer_sum  = col[0] + col[12];
        int inner_sum  = col[4] + col[8];
        int inner_diff = col[4] - col[8];
        int outer_diff = col[0] - col[12];

        col[0]  = (short)(outer_sum  + inner_sum);
        col[4]  = (short)(inner_diff + outer_diff);
        col[8]  = (short)(outer_sum  - inner_sum);
        col[12] = (short)(outer_diff - inner_diff);
    }

    // second pass: along the rows, with the final rounding (x + 3) >> 3
    for (int r = 0; r < 4; ++r) {
        short *row = input + r*4;
        int outer_sum  = row[0] + row[3];
        int inner_sum  = row[1] + row[2];
        int inner_diff = row[1] - row[2];
        int outer_diff = row[0] - row[3];

        row[0] = (short)twp__sra((outer_sum  + inner_sum)  + 3, 3);
        row[1] = (short)twp__sra((inner_diff + outer_diff) + 3, 3);
        row[2] = (short)twp__sra((outer_sum  - inner_sum)  + 3, 3);
        row[3] = (short)twp__sra((outer_diff - inner_diff) + 3, 3);
    }
}
|
|
|
|
// Average of two pixels, rounded to nearest (halves round up).
twp__INLINE static uint8_t twp__avg2_vp8(uint8_t a, uint8_t b)
{
    int sum = a + b;
    return (uint8_t)((sum + 1) >> 1);
}
|
|
|
|
// Weighted average of three pixels with the weights 1:2:1, rounded to nearest.
twp__INLINE static uint8_t twp__weighted_avg3(uint8_t a, uint8_t b, uint8_t c)
{
    int sum = a + 2*b + c;
    return (uint8_t)((sum + 2) >> 2);
}
|
|
|
|
// Writes the intra prediction of the 4x4 luma subblock (sx, sy) of the macroblock at mb_offset
// into buf, using the subblock mode stored in mb_info->y_sb_modes.
//
// The prediction is built from the already decoded pixels around the subblock:
//   left[0..3]  - the column to the left
//   above[0..7] - the row above (the modes LD and VL also read the four pixels above-right)
//   p           - the pixel above-left
// so the caller has to make sure that all of these are readable.
static void twp__predict_luma_subblock(uint8_t *buf, twp__mb_info *mb_info, int sx, int sy, int mb_offset, int stride)
{
    twp__assert(mb_info->y_mode == twp__MB_MODE_B_PRED);

    int sb_mode = mb_info->y_sb_modes[sy*4 + sx];
    int sb_offset = mb_offset + sy*4*stride + sx*4;

    uint8_t *dst = buf + sb_offset;
    uint8_t *above = dst - stride;
    uint8_t p = above[-1];

    uint8_t left[4];
    for (int r = 0; r < 4; ++r)
        left[r] = dst[r*stride - 1];

    uint8_t *row0 = dst;
    uint8_t *row1 = dst + stride;
    uint8_t *row2 = dst + stride*2;
    uint8_t *row3 = dst + stride*3;

    switch (sb_mode) {
        case twp__SB_MODE_DC_PRED: {
            // unlike the 16x16 version of DC_PRED, this always averages all 8 edge pixels,
            // no matter where in the frame the subblock is. the spec doesn't really point that out

#ifdef twp__SSE2
            int left4;
            int above4;
            memcpy(&left4, left, 4);
            memcpy(&above4, above, 4);
            __m128i edge = _mm_unpacklo_epi32(_mm_cvtsi32_si128(left4), _mm_cvtsi32_si128(above4));
            __m128i sum = _mm_sad_epu8(edge, _mm_setzero_si128()); // sum of the 8 edge bytes
            __m128i avg = _mm_srli_epi16(_mm_add_epi16(sum, _mm_set1_epi16(4)), 3);
            // broadcast the low byte to the lowest 4 bytes
            __m128i avg_x2 = _mm_unpacklo_epi8(avg, avg);
            __m128i avg_x4 = _mm_unpacklo_epi8(avg_x2, avg_x2);
            int fill = _mm_cvtsi128_si32(avg_x4);
            memcpy(row0, &fill, 4);
            memcpy(row1, &fill, 4);
            memcpy(row2, &fill, 4);
            memcpy(row3, &fill, 4);
#else
            int sum = 0;
            for (int i = 0; i < 4; ++i)
                sum += left[i] + above[i];
            int avg = (sum + 4) >> 3;
            twp__assert(avg >= 0 && avg <= 255);
            uint8_t fill = (uint8_t)avg;
            memset(row0, fill, 4);
            memset(row1, fill, 4);
            memset(row2, fill, 4);
            memset(row3, fill, 4);
#endif
        } break;

        case twp__SB_MODE_TM_PRED: {
            // left + above - p, clamped
#ifdef twp__SSE2
            int above4;
            memcpy(&above4, above, 4);
            __m128i above16 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(above4), _mm_setzero_si128());
            __m128i corner16 = _mm_set1_epi16(p);

            uint8_t *out = dst;
            for (int r = 0; r < 4; ++r) {
                __m128i left16 = _mm_set1_epi16(left[r]);
                __m128i pred = _mm_sub_epi16(_mm_add_epi16(above16, left16), corner16);
                pred = _mm_packus_epi16(pred, pred); // clamps to 0..255
                int packed = _mm_cvtsi128_si32(pred);
                memcpy(out, &packed, 4);
                out += stride;
            }
#else
            for (int r = 0; r < 4; ++r) {
                uint8_t *out = dst + r*stride;
                for (int c = 0; c < 4; ++c) {
                    int val = left[r] + above[c] - p;
                    val = (val < 0) ? 0 : val;
                    val = (val > 255) ? 255 : val;
                    out[c] = (uint8_t)val;
                }
            }
#endif
        } break;

        case twp__SB_MODE_VE_PRED: {
            // every row is the smoothed row above (above[-1] is p)
            uint8_t smoothed[4];
            for (int c = 0; c < 4; ++c)
                smoothed[c] = twp__weighted_avg3(above[c-1], above[c], above[c+1]);
            memcpy(row0, smoothed, 4);
            memcpy(row1, smoothed, 4);
            memcpy(row2, smoothed, 4);
            memcpy(row3, smoothed, 4);
        } break;

        case twp__SB_MODE_HE_PRED: {
            // every row is filled with the smoothed left pixel of that row
            memset(row0, twp__weighted_avg3(p,       left[0], left[1]), 4);
            memset(row1, twp__weighted_avg3(left[0], left[1], left[2]), 4);
            memset(row2, twp__weighted_avg3(left[1], left[2], left[3]), 4);
            memset(row3, twp__weighted_avg3(left[2], left[3], left[3]), 4);
        } break;

        case twp__SB_MODE_LD_PRED: {
            // the pixel at (row, col) only depends on row+col
            uint8_t diag[7];
            for (int k = 0; k < 6; ++k)
                diag[k] = twp__weighted_avg3(above[k], above[k+1], above[k+2]);
            diag[6] = twp__weighted_avg3(above[6], above[7], above[7]);

            memcpy(row0, diag + 0, 4);
            memcpy(row1, diag + 1, 4);
            memcpy(row2, diag + 2, 4);
            memcpy(row3, diag + 3, 4);
        } break;

        case twp__SB_MODE_RD_PRED: {
            // the edge going from bottom-left, over the corner, to top-right.
            // the pixel at (row, col) only depends on col-row
            uint8_t edge[9] = {
                left[3], left[2], left[1], left[0], p, above[0], above[1], above[2], above[3]
            };
            uint8_t diag[7];
            for (int k = 0; k < 7; ++k)
                diag[k] = twp__weighted_avg3(edge[k], edge[k+1], edge[k+2]);

            memcpy(row0, diag + 3, 4);
            memcpy(row1, diag + 2, 4);
            memcpy(row2, diag + 1, 4);
            memcpy(row3, diag + 0, 4);
        } break;

        case twp__SB_MODE_VR_PRED: {
            row3[0] = twp__weighted_avg3(left[2], left[1], left[0]);
            row3[1] = twp__weighted_avg3(left[0], p, above[0]);
            row3[2] = twp__weighted_avg3(p, above[0], above[1]);
            row3[3] = twp__weighted_avg3(above[0], above[1], above[2]);

            row2[0] = twp__weighted_avg3(left[1], left[0], p);
            row2[1] = twp__avg2_vp8(p, above[0]);
            row2[2] = twp__avg2_vp8(above[0], above[1]);
            row2[3] = twp__avg2_vp8(above[1], above[2]);

            // rows 1 and 0 are rows 3 and 2 shifted by one pixel, plus one new pixel each
            row1[0] = row3[1];
            row1[1] = row3[2];
            row1[2] = row3[3];
            row1[3] = twp__weighted_avg3(above[1], above[2], above[3]);

            row0[0] = row2[1];
            row0[1] = row2[2];
            row0[2] = row2[3];
            row0[3] = twp__avg2_vp8(above[2], above[3]);
        } break;

        case twp__SB_MODE_VL_PRED: {
            for (int c = 0; c < 4; ++c) {
                row0[c] = twp__avg2_vp8(above[c], above[c+1]);
                row1[c] = twp__weighted_avg3(above[c], above[c+1], above[c+2]);
            }

            // rows 2 and 3 are rows 0 and 1 shifted by one pixel, plus one new pixel each
            row2[0] = row0[1];
            row2[1] = row0[2];
            row2[2] = row0[3];
            row2[3] = twp__weighted_avg3(above[4], above[5], above[6]);

            row3[0] = row1[1];
            row3[1] = row1[2];
            row3[2] = row1[3];
            row3[3] = twp__weighted_avg3(above[5], above[6], above[7]);
        } break;

        case twp__SB_MODE_HD_PRED: {
            // every row is a 4-pixel window into this sequence, moving by 2 per row
            uint8_t seq[10];
            seq[0] = twp__avg2_vp8(left[3], left[2]);
            seq[1] = twp__weighted_avg3(left[3], left[2], left[1]);
            seq[2] = twp__avg2_vp8(left[2], left[1]);
            seq[3] = twp__weighted_avg3(left[2], left[1], left[0]);
            seq[4] = twp__avg2_vp8(left[1], left[0]);
            seq[5] = twp__weighted_avg3(left[1], left[0], p);
            seq[6] = twp__avg2_vp8(left[0], p);
            seq[7] = twp__weighted_avg3(left[0], p, above[0]);
            seq[8] = twp__weighted_avg3(p, above[0], above[1]);
            seq[9] = twp__weighted_avg3(above[0], above[1], above[2]);

            memcpy(row3, seq + 0, 4);
            memcpy(row2, seq + 2, 4);
            memcpy(row1, seq + 4, 4);
            memcpy(row0, seq + 6, 4);
        } break;

        case twp__SB_MODE_HU_PRED: {
            // every row is a 4-pixel window into this sequence, moving by 2 per row.
            // everything past the end of the left edge repeats left[3]
            uint8_t seq[10];
            seq[0] = twp__avg2_vp8(left[0], left[1]);
            seq[1] = twp__weighted_avg3(left[0], left[1], left[2]);
            seq[2] = twp__avg2_vp8(left[1], left[2]);
            seq[3] = twp__weighted_avg3(left[1], left[2], left[3]);
            seq[4] = twp__avg2_vp8(left[2], left[3]);
            seq[5] = twp__weighted_avg3(left[2], left[3], left[3]);
            seq[6] = left[3];
            seq[7] = left[3];
            seq[8] = left[3];
            seq[9] = left[3];

            memcpy(row0, seq + 0, 4);
            memcpy(row1, seq + 2, 4);
            memcpy(row2, seq + 4, 4);
            memcpy(row3, seq + 6, 4);
        } break;

        default: {
            twp__assert(0);
        } break;
    }
}
|
|
|
|
// Writes the intra prediction of a whole macroblock into buf at mb_offset:
// 16x16 pixels using mb_info->y_mode for luma, or 8x8 pixels using mb_info->uv_mode if
// `chroma` is non-zero.
//
// The column to the left, the row above and the pixel above-left of the block are read no
// matter what the mode is, so they have to be readable even for blocks on the frame edge.
// twp__MB_MODE_B_PRED is not handled here (see twp__predict_luma_subblock).
static void twp__predict_macroblock(uint8_t *buf, twp__mb_info *mb_info, int chroma, int mb_offset, int stride)
{
    int block_size = chroma ? 8 : 16;
    int mode = chroma ? mb_info->uv_mode : mb_info->y_mode;

    uint8_t *dst = buf + mb_offset;
    uint8_t *above = dst - stride;
    uint8_t p = above[-1];

    uint8_t left[16];
    for (int r = 0; r < block_size; ++r)
        left[r] = dst[r*stride - 1];

    switch (mode) {
        case twp__MB_MODE_DC_PRED: {
            // average whichever edges really exist in the frame. without any edge it's 128
            int use_left = (mb_info->x != 0);
            int use_above = (mb_info->y != 0);

            int avg = 128;
            if (use_left || use_above) {
                int sum = 0;
                for (int i = 0; i < block_size; ++i) {
                    if (use_above) sum += above[i];
                    if (use_left) sum += left[i];
                }
                int count = block_size * (use_left + use_above);
                avg = (sum + count/2) / count;
            }
            twp__assert(avg >= 0 && avg <= 255);

            uint8_t fill = (uint8_t)avg;
            for (int r = 0; r < block_size; ++r)
                memset(dst + r*stride, fill, block_size);
        } break;

        case twp__MB_MODE_V_PRED: {
            // every row is a copy of the row above the block
            for (int r = 0; r < block_size; ++r)
                memcpy(dst + r*stride, above, block_size);
        } break;

        case twp__MB_MODE_H_PRED: {
            // every row is filled with its left neighbor
            for (int r = 0; r < block_size; ++r)
                memset(dst + r*stride, left[r], block_size);
        } break;

        case twp__MB_MODE_TM_PRED: {
            // left + above - p, clamped
            for (int r = 0; r < block_size; ++r) {
                uint8_t *out = dst + r*stride;
                for (int c = 0; c < block_size; ++c) {
                    int val = left[r] + above[c] - p;
                    val = twp__clamp(val, 0, 255);
                    out[c] = (uint8_t)val;
                }
            }
        } break;

        case twp__MB_MODE_B_PRED: {
            // this mode can't be handled here, because the prediction of the 4x4 subblocks
            // must be interleaved with the residual decoding of the 4x4 subblocks
            // (and for chroma this mode doesn't exist anyway)
            twp__assert(0);
        } break;

        default: {
            twp__assert(0);
        } break;
    }
}
|
|
|
|
// Parses a VP8 keyframe: the uncompressed header at the start of raw_bytes, the frame header and
// the per-macroblock prediction records stored in the first partition, and finally the table of
// DCT partition sizes. One arithmetic decoder per DCT partition is initialized on success.
//
// data       decoder state; only the fields parsed here are assigned, everything else is left as is
// raw_bytes  start of the VP8 bitstream
// num_bytes  number of bytes available at raw_bytes
//
// Returns 1 on success and 0 if the stream is rejected, the macroblock array can't be allocated or
// the first-partition decoder flagged an error.
// data->mb_infos is heap-allocated in here and is not released on the failure paths that follow the
// allocation. NOTE(review): presumably the caller frees it - confirm.
static int twp__read_vp8_header(twp__vp8_data *data, uint8_t *raw_bytes, int num_bytes)
{
    if (num_bytes < twp__UNCOMPRESSED_VP8_HEADER_SIZE) return 0;

    int frame_type = raw_bytes[0] & 0x1;
    if (frame_type != 0) return 0; // 0 == keyframe, 1 == interframes, since we are decoding webp we only accept keyframes

    int show_frame = (raw_bytes[0] >> 4) & 0x1;
    if (show_frame != 1) return 0; // i assume this should always be 1 for keyframes?

    data->first_partition_size = (((int)raw_bytes[0] >> 5) & 0x7) | ((int)raw_bytes[1] << 3) | ((int)raw_bytes[2] << 11);
    if (data->first_partition_size > num_bytes - twp__UNCOMPRESSED_VP8_HEADER_SIZE) return 0;

    int magic = (int)raw_bytes[3] | ((int)raw_bytes[4] << 8) | ((int)raw_bytes[5] << 16);
    if (magic != 0x2a019d) return 0;

    // todo: currently, the upscaling options have no effect, consistent with libwebp's behavior. in the future,
    // it might be worth adding an option to actually upscale the image, rather than just ignoring this setting
    data->width = (int)raw_bytes[6] | (((int)raw_bytes[7] & 0x3f) << 8);
    data->upscaling_x = (twp__upscaling)(raw_bytes[7] >> 6);
    data->height = (int)raw_bytes[8] | (((int)raw_bytes[9] & 0x3f) << 8);
    data->upscaling_y = (twp__upscaling)(raw_bytes[9] >> 6);

    // an empty image has no macroblocks; reject it here so that the allocation further down never
    // turns into calloc(0, ...), whose result is implementation-defined
    if (data->width == 0 || data->height == 0) return 0;

    data->luma_width = twp__div_round_up(data->width, 16) * 16;
    data->luma_height = twp__div_round_up(data->height, 16) * 16;
    data->luma_stride = 1 + data->luma_width + 4;
    data->chroma_width = data->luma_width / 2;
    data->chroma_height = data->luma_height / 2;
    data->chroma_stride = 1 + data->chroma_width;
    data->mbs_per_row = data->luma_width / 16;
    data->mbs_per_col = data->luma_height / 16;
    data->num_mbs = data->mbs_per_row * data->mbs_per_col;
    data->sbs_per_row = data->mbs_per_row * 4;
    data->sbs_per_col = data->mbs_per_col * 4;
    data->num_sbs = data->sbs_per_row * data->sbs_per_col;

    twp__init_arith_decoder(&data->first_partition_dec, (uint8_t *)raw_bytes + twp__UNCOMPRESSED_VP8_HEADER_SIZE, data->first_partition_size);
    twp__arith_dec *dec = &data->first_partition_dec;

    int color_space = twp__read_arith_literal(dec, 1);
    if (color_space != 0) return 0; // 0 is the only valid value, 1 is "reserved for future use" (meaning probably never)

    data->require_clamping = twp__read_arith_literal(dec, 1);

    // segmentation_enabled is confusing. it doesn't actually tell you whether you have to read segment_ids
    // for the macroblocks; that's being done by update_mb_segmentation_map. so, if segmentation_enabled is
    // true, but update_mb_segmentation_map is false, then you *must not* read segment_ids; instead, all
    // segment_ids are assumed to be 0. note that update_segment_feature_data might have still been true,
    // in which case macroblocks *do not* just have the same behavior as if segmentation_enabled was false.
    // however, if segmentation_enabled is true, but both update_segment_feature_data and update_mb_segmentation_map
    // are false, then i think that would be the same as segmentation_enabled being false.

    data->segmentation_enabled = twp__read_arith_literal(dec, 1);
    if (data->segmentation_enabled) {
        int update_mb_segmentation_map = twp__read_arith_literal(dec, 1);
        int update_segment_feature_data = twp__read_arith_literal(dec, 1);

        if (update_segment_feature_data) {
            data->segment_mode = (twp__segment_mode)twp__read_arith_literal(dec, 1);

            for (int i = 0; i < twp__MAX_SEGMENTS; ++i) {
                if (twp__read_arith_literal(dec, 1))
                    data->segments[i].quant_update = twp__read_arith_signed_literal(dec, 7);
            }

            for (int i = 0; i < twp__MAX_SEGMENTS; ++i) {
                if (twp__read_arith_literal(dec, 1))
                    data->segments[i].lf_update = twp__read_arith_signed_literal(dec, 6);
            }
        }

        if (update_mb_segmentation_map) {
            for (int i = 0; i < twp__arrlen(data->segment_id_tree_probs); ++i) {
                if (twp__read_arith_literal(dec, 1))
                    data->segment_id_tree_probs[i] = twp__read_arith_literal(dec, 8);
            }
        } else {
            data->all_mbs_are_segment_0 = 1;
        }
    }

    data->loop_filter_type = (twp__filter_type)twp__read_arith_literal(dec, 1);
    data->loop_filter_level = twp__read_arith_literal(dec, 6);
    data->loop_filter_sharpness = twp__read_arith_literal(dec, 3);

    data->lf_adj_enabled = twp__read_arith_literal(dec, 1);
    if (data->lf_adj_enabled) {
        int mode_ref_lf_delta_update = twp__read_arith_literal(dec, 1);
        if (mode_ref_lf_delta_update == 0) {
            // i think this flag is 0 when you want to re-use the data from the last frame, but that doesn't make sense for an image,
            // so my assumption is it should always be 1 for webp?
            return 0;
        }

        for (int i = 0; i < 4; ++i) {
            int ref_frame_delta_update_flag = twp__read_arith_literal(dec, 1);
            if (ref_frame_delta_update_flag)
                data->lf_adj_ref_frame[i] = twp__read_arith_signed_literal(dec, 6);
        }

        for (int i = 0; i < 4; ++i) {
            int mb_mode_delta_update_flag = twp__read_arith_literal(dec, 1);
            if (mb_mode_delta_update_flag)
                data->lf_adj_mb_mode[i] = twp__read_arith_signed_literal(dec, 6);
        }
    }

    int log2_nbr_of_dct_partitions = twp__read_arith_literal(dec, 2);
    data->num_dct_partitions = 1 << log2_nbr_of_dct_partitions;
    twp__assert(data->num_dct_partitions <= twp__MAX_PARITIONS);

    int y_ac_quant_idx = twp__read_arith_literal(dec, 7);
    int y_dc_quant_idx_delta = twp__read_arith_flagged_signed_literal(dec, 4, 0);
    int y2_dc_quant_idx_delta = twp__read_arith_flagged_signed_literal(dec, 4, 0);
    int y2_ac_quant_idx_delta = twp__read_arith_flagged_signed_literal(dec, 4, 0);
    int uv_dc_quant_idx_delta = twp__read_arith_flagged_signed_literal(dec, 4, 0);
    int uv_ac_quant_idx_delta = twp__read_arith_flagged_signed_literal(dec, 4, 0);

    // the luma AC index is the base index, so its own delta is 0; U and V share the same deltas
    data->frame_quant_idx_base = y_ac_quant_idx;
    data->quant_idx_deltas[twp__DCT_PLANE_Y][twp__DC] = y_dc_quant_idx_delta;
    data->quant_idx_deltas[twp__DCT_PLANE_Y][twp__AC] = 0;
    data->quant_idx_deltas[twp__DCT_PLANE_Y2][twp__DC] = y2_dc_quant_idx_delta;
    data->quant_idx_deltas[twp__DCT_PLANE_Y2][twp__AC] = y2_ac_quant_idx_delta;
    data->quant_idx_deltas[twp__DCT_PLANE_U][twp__DC] = uv_dc_quant_idx_delta;
    data->quant_idx_deltas[twp__DCT_PLANE_U][twp__AC] = uv_ac_quant_idx_delta;
    data->quant_idx_deltas[twp__DCT_PLANE_V][twp__DC] = uv_dc_quant_idx_delta;
    data->quant_idx_deltas[twp__DCT_PLANE_V][twp__AC] = uv_ac_quant_idx_delta;

    // as far as i can tell this doesn't matter for webp images, so it's just ignored
    /*int refresh_entropy_probs = */twp__read_arith_literal(dec, 1);

    for (int i = 0; i < twp__arrlen(twp__coeff_update_probs); ++i) {
        for (int j = 0; j < twp__arrlen(twp__coeff_update_probs[0]); ++j) {
            for (int k = 0; k < twp__arrlen(twp__coeff_update_probs[0][0]); ++k) {
                for (int l = 0; l < twp__arrlen(twp__coeff_update_probs[0][0][0]); ++l) {
                    int coeff_prob_update_flag = twp__read_arith_bit(dec, twp__coeff_update_probs[i][j][k][l]);
                    if (coeff_prob_update_flag)
                        data->coeff_probs[i][j][k][l] = twp__read_arith_literal(dec, 8);
                }
            }
        }
    }

    // this specifies if skipping macroblocks with only 0's in them is enabled
    // the spec calls this mb_no_skip_coeff, which makes absolutely no sense at all?
    int enable_mb_skipping = twp__read_arith_literal(dec, 1);

    // if skipping is enabled, the probability for reading twp__mb_mode.skip (mb_skip_coeff in the spec)
    int skip_mb_flag_prob = 0;
    if (enable_mb_skipping)
        skip_mb_flag_prob = twp__read_arith_literal(dec, 8);

    // frame header finished, continue with macroblock prediction records

    data->mb_infos = (twp__mb_info *)calloc(data->num_mbs, sizeof(*data->mb_infos));
    if (!data->mb_infos) return 0; // out of memory

    for (int mb_idx = 0; mb_idx < data->num_mbs; ++mb_idx) {
        twp__mb_info *mb_info = data->mb_infos + mb_idx;

        int mb_x = mb_idx % data->mbs_per_row;
        int mb_y = mb_idx / data->mbs_per_row;
        mb_info->x = mb_x;
        mb_info->y = mb_y;

        mb_info->segment_id = 0;
        if (data->segmentation_enabled && !data->all_mbs_are_segment_0)
            mb_info->segment_id = (uint8_t)twp__read_arith_tree(dec, twp__segment_id_tree, data->segment_id_tree_probs, 0);

        if (enable_mb_skipping)
            mb_info->skip = (uint8_t)twp__read_arith_bit(dec, skip_mb_flag_prob);

        mb_info->y_mode = (uint8_t)twp__read_arith_tree(dec, twp__mb_y_mode_tree, twp__mb_y_mode_tree_probs, 0);
        if (mb_info->y_mode == twp__MB_MODE_B_PRED) {
            for (int y = 0; y < 4; ++y) {
                for (int x = 0; x < 4; ++x) {
                    int abs_x = mb_x*4 + x;
                    int abs_y = mb_y*4 + y;

                    // the probabilities for a subblock mode depend on the modes of the subblocks to the
                    // left and above, which may belong to the neighboring macroblocks; outside of the
                    // frame DC_PRED is used
                    int left;
                    if (abs_x > 0) {
                        if (x == 0)
                            left = mb_info[-1].y_sb_modes[y*4 + 3];
                        else
                            left = mb_info->y_sb_modes[y*4 + (x-1)];
                    } else {
                        left = twp__SB_MODE_DC_PRED;
                    }

                    int above;
                    if (abs_y > 0) {
                        if (y == 0)
                            above = mb_info[-data->mbs_per_row].y_sb_modes[3*4 + x];
                        else
                            above = mb_info->y_sb_modes[(y-1)*4 + x];
                    } else {
                        above = twp__SB_MODE_DC_PRED;
                    }

                    int sb_mode = twp__read_arith_tree(dec, twp__sb_mode_tree, twp__sb_mode_tree_probs[above][left], 0);
                    mb_info->y_sb_modes[y*4 + x] = (uint8_t)sb_mode;
                }
            }
        } else {
            // we also need to set the submodes to make the above/left thing for MODE_B work
            uint8_t sb_mode = 0;
            switch (mb_info->y_mode) {
                case twp__MB_MODE_DC_PRED: sb_mode = twp__SB_MODE_DC_PRED; break;
                case twp__MB_MODE_H_PRED: sb_mode = twp__SB_MODE_HE_PRED; break;
                case twp__MB_MODE_V_PRED: sb_mode = twp__SB_MODE_VE_PRED; break;
                case twp__MB_MODE_TM_PRED: sb_mode = twp__SB_MODE_TM_PRED; break;
                default: sb_mode = 0; twp__assert(0); break;
            }
            for (int i = 0; i < 16; ++i)
                mb_info->y_sb_modes[i] = sb_mode;
        }

        mb_info->uv_mode = (uint8_t)twp__read_arith_tree(dec, twp__mb_uv_mode_tree, twp__mb_uv_mode_tree_probs, 0);
    }

    // the first partition is followed by one 3-byte little-endian size for every DCT partition except
    // the last one; the last partition takes whatever remains of the input
    int num_bytes_for_partition_sizes = (data->num_dct_partitions-1) * 3;
    if (twp__UNCOMPRESSED_VP8_HEADER_SIZE + data->first_partition_size + num_bytes_for_partition_sizes > num_bytes)
        return 0;

    int dct_partitions_start_offset = twp__UNCOMPRESSED_VP8_HEADER_SIZE + data->first_partition_size + num_bytes_for_partition_sizes;

    int partition_size_sum = 0;
    for (int i = 0; i < data->num_dct_partitions-1; ++i) {
        twp__dct_partition *p = data->dct_partitions + i;
        int partition_size =
            (int)raw_bytes[twp__UNCOMPRESSED_VP8_HEADER_SIZE + data->first_partition_size + i*3 + 0] |
            ((int)raw_bytes[twp__UNCOMPRESSED_VP8_HEADER_SIZE + data->first_partition_size + i*3 + 1] << 8) |
            ((int)raw_bytes[twp__UNCOMPRESSED_VP8_HEADER_SIZE + data->first_partition_size + i*3 + 2] << 16);
        if (partition_size < 0) return 0;
        if (partition_size >= num_bytes) return 0;
        p->offset = partition_size_sum;
        p->size = partition_size;
        partition_size_sum += partition_size;
    }
    twp__dct_partition *last_p = &data->dct_partitions[data->num_dct_partitions-1];
    last_p->offset = partition_size_sum;
    last_p->size = num_bytes - dct_partitions_start_offset - partition_size_sum;
    // a last partition of at least one byte also proves that all previous partitions fit into the input
    if (last_p->size < 1) return 0;
    if (dct_partitions_start_offset + last_p->offset + last_p->size > num_bytes) return 0;
    for (int i = 0; i < data->num_dct_partitions; ++i) {
        twp__dct_partition *p = data->dct_partitions + i;
        twp__init_arith_decoder(&p->dec, raw_bytes + dct_partitions_start_offset + p->offset, p->size);
    }

    return !dec->err;
}
|
|
|
|
static void twp__build_quant_table(twp__vp8_data *data, twp__mb_info *mb_info,
|
|
int quant_table[twp__NUM_DCT_PLANES][twp__NUM_DCT_COEFF_TYPES])
|
|
{
|
|
for (int plane = 0; plane < twp__NUM_DCT_PLANES; ++plane) {
|
|
for (int coeff_type = 0; coeff_type < twp__NUM_DCT_COEFF_TYPES; ++coeff_type) {
|
|
int quant_idx_base = data->frame_quant_idx_base;
|
|
if (data->segmentation_enabled) {
|
|
if (data->segment_mode == twp__SEGMENT_MODE_DELTA)
|
|
quant_idx_base += data->segments[mb_info->segment_id].quant_update;
|
|
else if (data->segment_mode == twp__SEGMENT_MODE_ABSOLUTE)
|
|
quant_idx_base = data->segments[mb_info->segment_id].quant_update;
|
|
else
|
|
twp__assert(0);
|
|
}
|
|
|
|
int quant_idx = quant_idx_base + data->quant_idx_deltas[plane][coeff_type];
|
|
int quant = twp__get_quant((twp__dct_coeff_type)coeff_type, quant_idx);
|
|
|
|
if (plane == twp__DCT_PLANE_Y2) {
|
|
if (coeff_type == twp__DC) {
|
|
quant *= 2;
|
|
} else {
|
|
quant = quant * 155 / 100;
|
|
if (quant < 8) quant = 8;
|
|
}
|
|
} else if (plane == twp__DCT_PLANE_U || plane == twp__DCT_PLANE_V) {
|
|
if (coeff_type == twp__DC) {
|
|
if (quant > 132) quant = 132;
|
|
}
|
|
}
|
|
|
|
quant_table[plane][coeff_type] = quant;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Returns the index of the top-left pixel of macroblock (x, y) inside a plane with the given stride.
// Macroblocks are 16x16 for luma and 8x8 for chroma. The +1 in each direction steps over one extra
// row and one extra column that come before the first macroblock (twp__read_yuv_data fills those
// with the out-of-bounds prediction values).
twp__INLINE static int twp__calc_mb_offset(int x, int y, int stride, int chroma)
{
    int block_size = 16;
    if (chroma)
        block_size = 8;

    int row = y*block_size + 1;
    int col = x*block_size + 1;
    return row*stride + col;
}
|
|
|
|
// Reconstructs the Y, U and V planes of the frame, macroblock by macroblock: intra prediction,
// residual decoding from the DCT partitions and the inverse transforms. Loop filtering is not done
// here; only mb_info->skip_sb_filtering is recorded for it.
//
// All planes live in a single allocation that starts at data->plane_y (Y, then U, then V, followed by
// room for width*height alpha bytes if format is twp_FORMAT_YUVA). Every plane has one extra row at
// the top and one extra column on the left that hold the out-of-bounds prediction values.
//
// Returns 1 on success and 0 if an allocation fails or any DCT partition decoder flagged an error.
// data->plane_y is left allocated when 0 is returned after the plane allocation succeeded.
// NOTE(review): presumably the caller releases it - confirm.
static int twp__read_yuv_data(twp__vp8_data *data, twp_format format)
{
    int y_bufsize = data->luma_stride * (data->luma_height+1);
    int uv_bufsize = data->chroma_stride * (data->chroma_height+1);
    int a_bufsize = (format == twp_FORMAT_YUVA) ? (data->width * data->height) : 0;
    data->plane_y = (uint8_t *)malloc(y_bufsize + uv_bufsize*2 + a_bufsize);
    if (!data->plane_y) return 0; // out of memory
    data->plane_u = data->plane_y + y_bufsize;
    data->plane_v = data->plane_u + uv_bufsize;

    // the spec does not say what the value of the top left out of bounds value should be.. but it's 127
    // https://codec-devel.webmproject.narkive.com/RBobMxzF/out-of-bound-value-of-p
    for (int i = 0; i < data->luma_stride; ++i) {
        data->plane_y[i] = 127;
    }
    for (int i = 1; i < data->luma_height+1; ++i) {
        data->plane_y[i*data->luma_stride] = 129;
    }
    for (int i = 0; i < data->chroma_stride; ++i) {
        data->plane_u[i] = 127;
        data->plane_v[i] = 127;
    }
    for (int i = 1; i < data->chroma_height+1; ++i) {
        data->plane_u[i*data->chroma_stride] = 129;
        data->plane_v[i*data->chroma_stride] = 129;
    }

    // "non-zero" context flags that are passed to twp__read_residual_block() for the blocks to the left
    // (reset at the start of every macroblock row) and above (one entry per block column of the frame)
    uint8_t nz_left[twp__NUM_DCT_PLANES][4];
    uint8_t *nz_above[twp__NUM_DCT_PLANES];
    int nz_above_alloc_size = data->mbs_per_row*4 + data->mbs_per_row + data->mbs_per_row*2 + data->mbs_per_row*2; // y + y2 + u + v
    uint8_t *nz_above_alloc = (uint8_t *)calloc(nz_above_alloc_size, sizeof(**nz_above));
    if (!nz_above_alloc) return 0; // out of memory
    nz_above[twp__DCT_PLANE_Y] = nz_above_alloc;
    nz_above[twp__DCT_PLANE_Y2] = nz_above[twp__DCT_PLANE_Y] + data->mbs_per_row*4;
    nz_above[twp__DCT_PLANE_U] = nz_above[twp__DCT_PLANE_Y2] + data->mbs_per_row;
    nz_above[twp__DCT_PLANE_V] = nz_above[twp__DCT_PLANE_U] + data->mbs_per_row*2;

    // macroblock rows take turns reading from the DCT partitions
    int curr_partition_idx = 0;
    for (int mb_y = 0; mb_y < data->mbs_per_col; ++mb_y) {
        twp__arith_dec *dec = &data->dct_partitions[curr_partition_idx].dec;
        memset(nz_left, 0, sizeof(nz_left));

        // make sure the special subblock prediction case handling below works correctly when mb_x == mbs_per_row-1
        memset(data->plane_y + ((mb_y*16) * data->luma_stride) + data->luma_width + 1,
               data->plane_y[((mb_y*16) * data->luma_stride) + data->luma_width],
               4);

        for (int mb_x = 0; mb_x < data->mbs_per_row; ++mb_x) {
            twp__mb_info *mb_info = &data->mb_infos[mb_y*data->mbs_per_row + mb_x];
            int luma_mb_offset = twp__calc_mb_offset(mb_x, mb_y, data->luma_stride, 0);
            int chroma_mb_offset = twp__calc_mb_offset(mb_x, mb_y, data->chroma_stride, 1);
            int have_y2 = (mb_info->y_mode != twp__MB_MODE_B_PRED);

            twp__predict_macroblock(data->plane_u, mb_info, 1, chroma_mb_offset, data->chroma_stride);
            twp__predict_macroblock(data->plane_v, mb_info, 1, chroma_mb_offset, data->chroma_stride);
            if (have_y2) {
                // if the mode is B_PRED, we need to fully reconstruct each subblock before going to the next subblock,
                // which means we can't just predict the entire 16x16 block first
                twp__predict_macroblock(data->plane_y, mb_info, 0, luma_mb_offset, data->luma_stride);
            } else {
                // handle special subblock prediction case when sb_x == 3
                // (copies the 4 pixels above-right of the macroblock into the same columns of every 4th row below)
                uint8_t *pred_row = data->plane_y + ((mb_y*16) * data->luma_stride);
                uint8_t *row_at = pred_row;
                for (int y = 0; y < 4; ++y) {
                    row_at += 4*data->luma_stride;
                    memcpy(row_at + mb_x*16 + 17, pred_row + mb_x*16 + 17, 4);
                }
            }

            if (mb_info->skip) {
                // no residual data is stored for this macroblock: clear its context flags and keep the prediction
                if (have_y2) {
                    nz_left[twp__DCT_PLANE_Y2][0] = 0;
                    nz_above[twp__DCT_PLANE_Y2][mb_x] = 0;
                    mb_info->skip_sb_filtering = 1;
                }

                for (int sb_y = 0; sb_y < 4; ++sb_y) {
                    for (int sb_x = 0; sb_x < 4; ++sb_x) {
                        nz_left[twp__DCT_PLANE_Y][sb_y] = 0;
                        nz_above[twp__DCT_PLANE_Y][mb_x*4 + sb_x] = 0;

                        if (!have_y2)
                            twp__predict_luma_subblock(data->plane_y, mb_info, sb_x, sb_y, luma_mb_offset, data->luma_stride);
                    }
                }

                for (int sb_y = 0; sb_y < 2; ++sb_y) {
                    for (int sb_x = 0; sb_x < 2; ++sb_x) {
                        nz_left[twp__DCT_PLANE_U][sb_y] = 0;
                        nz_left[twp__DCT_PLANE_V][sb_y] = 0;
                        nz_above[twp__DCT_PLANE_U][mb_x*2 + sb_x] = 0;
                        nz_above[twp__DCT_PLANE_V][mb_x*2 + sb_x] = 0;
                    }
                }

                continue;
            }

            // we could build the quant table outside of the loop if this ever becomes a hotspot..
            int quant_table[twp__NUM_DCT_PLANES][twp__NUM_DCT_COEFF_TYPES];
            twp__build_quant_table(data, mb_info, quant_table);

            int all_zero = 1;
            short dc_coeffs[16];
            short residue[16];

            if (have_y2) {
                // the Y2 block carries the DC coefficients of the 16 luma subblocks
                all_zero &= twp__read_residual_block(dec, twp__DCT_TYPE_Y2, dc_coeffs, data->coeff_probs,
                                                     &nz_left[twp__DCT_PLANE_Y2][0], &nz_above[twp__DCT_PLANE_Y2][mb_x],
                                                     quant_table[twp__DCT_PLANE_Y2][twp__DC], quant_table[twp__DCT_PLANE_Y2][twp__AC]);
                twp__iwht(dc_coeffs);
            }

            for (int sb_y = 0; sb_y < 4; ++sb_y) {
                for (int sb_x = 0; sb_x < 4; ++sb_x) {
                    all_zero &= twp__read_residual_block(dec, have_y2 ? twp__DCT_TYPE_Y_WITHOUT_DC : twp__DCT_TYPE_Y_WITH_DC,
                                                         residue, data->coeff_probs, &nz_left[twp__DCT_PLANE_Y][sb_y], &nz_above[twp__DCT_PLANE_Y][sb_x + mb_x*4],
                                                         quant_table[twp__DCT_PLANE_Y][twp__DC], quant_table[twp__DCT_PLANE_Y][twp__AC]);
                    if (!have_y2)
                        twp__predict_luma_subblock(data->plane_y, mb_info, sb_x, sb_y, luma_mb_offset, data->luma_stride);
                    else
                        residue[0] = dc_coeffs[sb_y*4 + sb_x];

                    twp__idct(residue, data->plane_y, luma_mb_offset, sb_x, sb_y, data->luma_stride);
                }
            }

            for (int sb_y = 0; sb_y < 2; ++sb_y) {
                for (int sb_x = 0; sb_x < 2; ++sb_x) {
                    all_zero &= twp__read_residual_block(dec, twp__DCT_TYPE_UV, residue, data->coeff_probs,
                                                         &nz_left[twp__DCT_PLANE_U][sb_y], &nz_above[twp__DCT_PLANE_U][sb_x + mb_x*2],
                                                         quant_table[twp__DCT_PLANE_U][twp__DC], quant_table[twp__DCT_PLANE_U][twp__AC]);
                    twp__idct(residue, data->plane_u, chroma_mb_offset, sb_x, sb_y, data->chroma_stride);
                }
            }

            for (int sb_y = 0; sb_y < 2; ++sb_y) {
                for (int sb_x = 0; sb_x < 2; ++sb_x) {
                    all_zero &= twp__read_residual_block(dec, twp__DCT_TYPE_UV, residue, data->coeff_probs,
                                                         &nz_left[twp__DCT_PLANE_V][sb_y], &nz_above[twp__DCT_PLANE_V][sb_x + mb_x*2],
                                                         quant_table[twp__DCT_PLANE_V][twp__DC], quant_table[twp__DCT_PLANE_V][twp__AC]);
                    twp__idct(residue, data->plane_v, chroma_mb_offset, sb_x, sb_y, data->chroma_stride);
                }
            }

            mb_info->skip_sb_filtering = (all_zero && have_y2);
        }

        ++curr_partition_idx;
        if (curr_partition_idx >= data->num_dct_partitions)
            curr_partition_idx = 0;
    }

    free(nz_above_alloc);

    for (int i = 0; i < data->num_dct_partitions; ++i) {
        if (data->dct_partitions[i].dec.err)
            return 0;
    }

    return 1;
}
|
|
|
|
// Computes the loop filter level (0..63) for one macroblock.
//
// A frame-level filter level of 0 always yields 0. Otherwise the segment's lf_update either replaces
// the frame level (absolute mode) or is added to it (delta mode), and if loop filter adjustments are
// enabled, the first reference frame delta and - for B_PRED macroblocks - the first mode delta are
// added on top. The level is clamped to 0..63 after each of the two steps.
static int twp__get_filter_level(twp__vp8_data *data, twp__mb_info *mb_info)
{
    // how this works is not explained in the spec at all. you just have to decipher their shitty c code

    int level = data->loop_filter_level;
    if (level == 0) return 0; // if 0 at the frame level, filtering should be skipped

    if (data->segmentation_enabled) {
        switch (data->segment_mode) {
            case twp__SEGMENT_MODE_ABSOLUTE:
                level = data->segments[mb_info->segment_id].lf_update;
                break;
            case twp__SEGMENT_MODE_DELTA:
                level += data->segments[mb_info->segment_id].lf_update;
                break;
            default:
                twp__assert(0);
                break;
        }
    }
    level = twp__clamp(level, 0, 63);

    if (data->lf_adj_enabled) {
        int adjustment = data->lf_adj_ref_frame[0];
        if (mb_info->y_mode == twp__MB_MODE_B_PRED)
            adjustment += data->lf_adj_mb_mode[0];
        level += adjustment;
    }
    level = twp__clamp(level, 0, 63);

    return level;
}
|
|
|
|
// Derives the interior limit of the loop filter from the filter level and the frame's sharpness.
//
// Without sharpness the limit is the filter level itself. Otherwise the level is shifted right by 1
// (sharpness 1..4) or by 2 (sharpness > 4) and capped at 9 - sharpness. The result is never less
// than 1: the VP8 decoding guide raises a limit of 0 to 1, and the same floor exists in libvpx and
// libwebp. Without it, small filter levels combined with a non-zero sharpness would produce 0 here.
static int twp__get_interior_limit(int filter_level, int sharpness)
{
    // the maximum value of this is the same as filter_level, meaning 63

    int interior_limit = filter_level;
    if (sharpness != 0) {
        if (sharpness > 4)
            interior_limit >>= 2;
        else
            interior_limit >>= 1;

        int max_interior_limit = 9 - sharpness;
        if (interior_limit > max_interior_limit)
            interior_limit = max_interior_limit;
    }
    if (interior_limit < 1)
        interior_limit = 1;

    return interior_limit;
}
|
|
|
|
// Maps the filter level to the "high edge variance" threshold:
// 0 for levels below 15, 1 for levels 15..39 and 2 for levels of 40 and above.
static int twp__get_hev_threshold(int filter_level)
{
    if (filter_level >= 40) return 2;
    if (filter_level >= 15) return 1;
    return 0;
}
|
|
|
|
// Computes the edge limits for macroblock edges (*mb) and subblock edges (*sb). Both are twice the
// filter level plus the interior limit; the macroblock limit is larger by 4.
static void twp__get_edge_limits(int filter_level, int interior_limit, int *mb, int *sb)
{
    // with filter_level == 63 and interior_limit == 63 this gives the maximum values,
    // 193 for mb and 189 for sb
    int doubled_level = filter_level * 2;
    *mb = doubled_level + 4 + interior_limit;
    *sb = doubled_level + interior_limit;
}
|
|
|
|
#ifndef twp__SSE2
|
|
|
|
// Clamps a loop filter intermediate value to the range of a signed 8-bit integer.
twp__INLINE static int twp__lf_clamp(int val)
{
    const int lowest = -128;
    const int highest = 127;
    return twp__clamp(val, lowest, highest);
}
|
|
|
|
// Converts an unsigned pixel value (0..255) to the signed representation (-128..127) that the
// loop filter arithmetic works with.
twp__INLINE static int twp__lf_u2s(int val)
{
    const int bias = 128;
    return val - bias;
}
|
|
|
|
// Converts a signed loop filter value back to an unsigned pixel: the value is clamped to -128..127
// first and then shifted up by 128.
twp__INLINE static uint8_t twp__lf_s2u(int val)
{
    int clamped = twp__lf_clamp(val);
    return (uint8_t)(clamped + 128);
}
|
|
|
|
// Edge test shared by the simple and the normal loop filter: returns non-zero if twice the
// difference across the edge (p0/q0) plus half the difference of the outer pixels (p1/q1) does not
// exceed edge_limit.
static int twp__simple_threshold(int edge_limit, int p1, int p0, int q0, int q1)
{
    int edge_diff = twp__abs(p0 - q0);
    int outer_diff = twp__abs(p1 - q1);
    return (2*edge_diff + outer_diff/2) <= edge_limit;
}
|
|
|
|
// "High edge variance" test: returns 1 if the two pixels on either side of the edge differ from
// each other by more than hev_threshold, 0 otherwise.
static int twp__is_hev(int hev_threshold, int p1, int p0, int q0, int q1)
{
    if (twp__abs(p1 - p0) > hev_threshold)
        return 1;
    return twp__abs(q1 - q0) > hev_threshold;
}
|
|
|
|
// Decides whether the normal loop filter is applied at one position of an edge: the simple edge test
// has to pass and all differences between adjacent pixels on the same side of the edge have to be
// within interior_limit. Returns 1 if the position should be filtered, 0 otherwise.
static int twp__normal_threshold(int edge_limit, int interior_limit,
                                 int p3, int p2, int p1, int p0, int q0, int q1, int q2, int q3)
{
    if (!twp__simple_threshold(edge_limit, p1, p0, q0, q1)) return 0;

    if (twp__abs(p3 - p2) > interior_limit) return 0;
    if (twp__abs(p2 - p1) > interior_limit) return 0;
    if (twp__abs(p1 - p0) > interior_limit) return 0;
    if (twp__abs(q0 - q1) > interior_limit) return 0;
    if (twp__abs(q1 - q2) > interior_limit) return 0;
    if (twp__abs(q2 - q3) > interior_limit) return 0;

    return 1;
}
|
|
|
|
// Adjusts the two pixels next to the edge (*p0 and *q0, in signed representation) and returns the
// amount that was subtracted from *q0, which some callers re-use for the outer pixels.
//
// The filter value is 3*(q0 - p0), plus the clamped difference p1 - q1 if use_outer_taps is set.
// Two differently rounded versions of it are applied: (value + 4) >> 3 is subtracted from q0 and
// (value + 3) >> 3 is added to p0.
static int twp__filter_common(int use_outer_taps, int p1, int *p0, int *q0, int q1)
{
    int filter_value = 3*(*q0 - *p0);
    if (use_outer_taps)
        filter_value = twp__lf_clamp(p1 - q1) + filter_value;
    filter_value = twp__lf_clamp(filter_value);

    int p_adjust = twp__sra(twp__lf_clamp(filter_value + 3), 3);
    int q_adjust = twp__sra(twp__lf_clamp(filter_value + 4), 3);

    *q0 = twp__lf_clamp(*q0 - q_adjust);
    *p0 = twp__lf_clamp(*p0 + p_adjust);

    return q_adjust;
}
|
|
|
|
// Collects the 8 pixels that the normal loop filter looks at for position i of an edge: 4 before
// the edge (p3..p0) and 4 after it (q0..q3).
//
// offset is the index of the first pixel after the edge (q0 at i == 0). If vert is non-zero the 8
// pixels are stacked vertically (stride apart) and i moves along the row; otherwise they are next
// to each other and i moves down the column.
// The *_ptr outputs receive the addresses of the pixels inside plane, the int outputs receive their
// values in signed representation.
static void twp__normal_filter_get_pixels(uint8_t *plane, int stride, int offset, int vert, int i,
                                          uint8_t **p3_ptr, uint8_t **p2_ptr, uint8_t **p1_ptr, uint8_t **p0_ptr,
                                          uint8_t **q0_ptr, uint8_t **q1_ptr, uint8_t **q2_ptr, uint8_t **q3_ptr,
                                          int *p3, int *p2, int *p1, int *p0, int *q0, int *q1, int *q2, int *q3)
{
    // distance between two consecutive pixels of the group, and the address of the first one (p3)
    int step = vert ? stride : 1;
    uint8_t *first = vert ? &plane[offset - 4*stride + i]
                          : &plane[offset + i*stride - 4];

    *p3_ptr = first;
    *p2_ptr = first + 1*step;
    *p1_ptr = first + 2*step;
    *p0_ptr = first + 3*step;
    *q0_ptr = first + 4*step;
    *q1_ptr = first + 5*step;
    *q2_ptr = first + 6*step;
    *q3_ptr = first + 7*step;

    *p3 = twp__lf_u2s(first[0]);
    *p2 = twp__lf_u2s(first[1*step]);
    *p1 = twp__lf_u2s(first[2*step]);
    *p0 = twp__lf_u2s(first[3*step]);
    *q0 = twp__lf_u2s(first[4*step]);
    *q1 = twp__lf_u2s(first[5*step]);
    *q2 = twp__lf_u2s(first[6*step]);
    *q3 = twp__lf_u2s(first[7*step]);
}
|
|
|
|
// Applies the normal loop filter to the subblock edges inside one macroblock of a single plane:
// 3 edges (at 4, 8 and 12 pixels) of length 16 for luma, 1 edge (at 4 pixels) of length 8 for chroma.
// With vert set, the edges run horizontally and pixels are filtered across them in vertical
// direction; otherwise it is the other way around.
static void twp__normal_filter_sb_(int edge_limit, int interior_limit, int hev_threshold,
                                   uint8_t *plane, int stride, int mb_offset, int vert, int chroma)
{
    int num_edges = chroma ? 1 : 3;
    int edge_length = chroma ? 8 : 16;
    int edge_distance = vert ? 4*stride : 4;

    for (int edge = 1; edge <= num_edges; ++edge) {
        int sb_offset = mb_offset + edge*edge_distance;

        for (int i = 0; i < edge_length; ++i) {
            uint8_t *p3_ptr, *p2_ptr, *p1_ptr, *p0_ptr, *q0_ptr, *q1_ptr, *q2_ptr, *q3_ptr;
            int p3, p2, p1, p0, q0, q1, q2, q3;
            twp__normal_filter_get_pixels(plane, stride, sb_offset, vert, i,
                                          &p3_ptr, &p2_ptr, &p1_ptr, &p0_ptr,
                                          &q0_ptr, &q1_ptr, &q2_ptr, &q3_ptr,
                                          &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);

            if (!twp__normal_threshold(edge_limit, interior_limit, p3, p2, p1, p0, q0, q1, q2, q3))
                continue;

            // the outer taps are only used for the inner pixels if there is high edge variance
            int hev = twp__is_hev(hev_threshold, p1, p0, q0, q1);
            int adjust = twp__filter_common(hev, p1, &p0, &q0, q1);
            *p0_ptr = twp__lf_s2u(p0);
            *q0_ptr = twp__lf_s2u(q0);

            // the outer pixels are only touched when there is no high edge variance
            if (hev)
                continue;

            adjust = twp__sra((adjust + 1), 1);
            *q1_ptr = twp__lf_s2u(q1 - adjust);
            *p1_ptr = twp__lf_s2u(p1 + adjust);
        }
    }
}
|
|
|
|
// Subblock edge filter entry point. If both planes are given they are filtered as chroma planes
// (U and V); if only plane0 is given it is filtered as the luma plane.
static void twp__normal_filter_sb(int edge_limit, int interior_limit, int hev_threshold,
                                  uint8_t *plane0, uint8_t *plane1, int stride, int mb_offset, int vert)
{
    // this function makes it so the simd version has the same function signature, which means we don't
    // need a bunch of ifdefs
    int is_chroma = (plane0 && plane1);

    twp__normal_filter_sb_(edge_limit, interior_limit, hev_threshold, plane0, stride, mb_offset, vert, is_chroma);
    if (is_chroma)
        twp__normal_filter_sb_(edge_limit, interior_limit, hev_threshold, plane1, stride, mb_offset, vert, 1);
}
|
|
|
|
// Applies the normal loop filter to one macroblock edge of a single plane (the edge that lies
// directly before mb_offset). The edge is 16 pixels long for luma and 8 pixels long for chroma.
// With vert set, pixels are filtered across the edge in vertical direction, otherwise horizontally.
static void twp__normal_filter_mb_(int edge_limit, int interior_limit, int hev_threshold,
                                   uint8_t *plane, int stride, int mb_offset, int vert, int chroma)
{
    int edge_length = chroma ? 8 : 16;

    for (int i = 0; i < edge_length; ++i) {
        uint8_t *p3_ptr, *p2_ptr, *p1_ptr, *p0_ptr, *q0_ptr, *q1_ptr, *q2_ptr, *q3_ptr;
        int p3, p2, p1, p0, q0, q1, q2, q3;
        twp__normal_filter_get_pixels(plane, stride, mb_offset, vert, i,
                                      &p3_ptr, &p2_ptr, &p1_ptr, &p0_ptr,
                                      &q0_ptr, &q1_ptr, &q2_ptr, &q3_ptr,
                                      &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);

        if (!twp__normal_threshold(edge_limit, interior_limit, p3, p2, p1, p0, q0, q1, q2, q3))
            continue;

        if (!twp__is_hev(hev_threshold, p1, p0, q0, q1)) {
            // no high edge variance: spread the correction over 3 pixels on each side of the edge,
            // with weights of 27/128, 18/128 and 9/128 going outwards
            int w = twp__lf_clamp(twp__lf_clamp(p1 - q1) + 3*(q0 - p0));

            int inner = twp__lf_clamp(twp__sra(27*w + 63, 7));
            *q0_ptr = twp__lf_s2u(q0 - inner);
            *p0_ptr = twp__lf_s2u(p0 + inner);

            int middle = twp__lf_clamp(twp__sra(18*w + 63, 7));
            *q1_ptr = twp__lf_s2u(q1 - middle);
            *p1_ptr = twp__lf_s2u(p1 + middle);

            int outer = twp__lf_clamp(twp__sra(9*w + 63, 7));
            *q2_ptr = twp__lf_s2u(q2 - outer);
            *p2_ptr = twp__lf_s2u(p2 + outer);
        } else {
            // high edge variance: only the two pixels next to the edge are adjusted
            twp__filter_common(1, p1, &p0, &q0, q1);
            *p0_ptr = twp__lf_s2u(p0);
            *q0_ptr = twp__lf_s2u(q0);
        }
    }
}
|
|
|
|
// Macroblock edge filter entry point. If both planes are given they are filtered as chroma planes
// (U and V); if only plane0 is given it is filtered as the luma plane.
static void twp__normal_filter_mb(int edge_limit, int interior_limit, int hev_threshold,
                                  uint8_t *plane0, uint8_t *plane1, int stride, int mb_offset, int vert)
{
    // this function makes it so the simd version has the same function signature, which means we don't
    // need a bunch of ifdefs
    int is_chroma = (plane0 && plane1);

    twp__normal_filter_mb_(edge_limit, interior_limit, hev_threshold, plane0, stride, mb_offset, vert, is_chroma);
    if (is_chroma)
        twp__normal_filter_mb_(edge_limit, interior_limit, hev_threshold, plane1, stride, mb_offset, vert, 1);
}
|
|
|
|
// Applies the simple loop filter along one 16 pixel long edge (the edge that lies directly before
// mb_offset). Only 2 pixels on each side of the edge are read, and only the 2 pixels next to the edge
// are modified. With vert set, pixels are filtered across the edge in vertical direction, otherwise
// horizontally.
static void twp__simple_filter_mb(int edge_limit, uint8_t *plane, int stride, int mb_offset, int vert)
{
    // distance between two consecutive pixels across the edge
    int step = vert ? stride : 1;

    for (int i = 0; i < 16; ++i) {
        uint8_t *p1_ptr = vert ? &plane[mb_offset - 2*stride + i]
                               : &plane[mb_offset + i*stride - 2];
        uint8_t *p0_ptr = p1_ptr + 1*step;
        uint8_t *q0_ptr = p1_ptr + 2*step;
        uint8_t *q1_ptr = p1_ptr + 3*step;

        int p1 = twp__lf_u2s(*p1_ptr);
        int p0 = twp__lf_u2s(*p0_ptr);
        int q0 = twp__lf_u2s(*q0_ptr);
        int q1 = twp__lf_u2s(*q1_ptr);

        if (!twp__simple_threshold(edge_limit, p1, p0, q0, q1))
            continue;

        twp__filter_common(1, p1, &p0, &q0, q1);
        *p0_ptr = twp__lf_s2u(p0);
        *q0_ptr = twp__lf_s2u(q0);
    }
}
|
|
|
|
#else
|
|
|
|
// Arithmetic (sign-preserving) right shift by 3 of each of the 16 bytes in val. SSE2 has no 8-bit
// shifts, so every byte is moved into the upper half of a 16-bit lane, shifted there by 8 + 3 bits
// and packed back into a byte afterwards.
twp__INLINE static __m128i twp__arith_right_shift_bytes_by_3(__m128i val)
{
    __m128i zero = _mm_setzero_si128();

    // interleaving with zero as the first operand puts each byte of val into the high byte of its lane
    __m128i lower_half = _mm_srai_epi16(_mm_unpacklo_epi8(zero, val), 11);
    __m128i upper_half = _mm_srai_epi16(_mm_unpackhi_epi8(zero, val), 11);

    return _mm_packs_epi16(lower_half, upper_half);
}
|
|
|
|
// Logical right shift by 1 of each of the 16 bytes in val. The shift itself is done on 16-bit lanes,
// so the lowest bit of every byte is cleared beforehand; otherwise the lowest bit of the upper byte
// of a lane would end up as the highest bit of the lower byte.
twp__INLINE static __m128i twp__logical_right_shift_bytes_by_1(__m128i val)
{
    __m128i without_lsb = _mm_and_si128(val, _mm_set1_epi8(-2)); // -2 == 0b11111110
    return _mm_srli_epi16(without_lsb, 1);
}
|
|
|
|
// in-place transpose of a byte matrix with 16 rows and 8 columns.
//
// on entry the matrix is stored row-major across the eight registers: register k holds bytes
// 16k .. 16k+15, which are matrix rows 2k (lower 8 bytes) and 2k+1 (upper 8 bytes).
// on return register c holds column c, i.e. bytes c, c+8, c+16, ..., c+120 in that order:
//
//   i0 =   0   8  16  24  32  40  48  56  64  72  80  88  96 104 112 120
//   i1 =   1   9  17  25  33  41  49  57  65  73  81  89  97 105 113 121
//   ...
//   i7 =   7  15  23  31  39  47  55  63  71  79  87  95 103 111 119 127
twp__INLINE static void twp__transpose_bytes_16x8(__m128i *i0, __m128i *i1, __m128i *i2, __m128i *i3,
                                                  __m128i *i4, __m128i *i5, __m128i *i6, __m128i *i7)
{
    const __m128i in0 = *i0, in1 = *i1, in2 = *i2, in3 = *i3;
    const __m128i in4 = *i4, in5 = *i5, in6 = *i6, in7 = *i7;

    // step 1: byte-interleave neighbouring registers
    __m128i a01_lo = _mm_unpacklo_epi8(in0, in1); //   0  16   1  17   2  18   3  19   4  20   5  21   6  22   7  23
    __m128i a01_hi = _mm_unpackhi_epi8(in0, in1); //   8  24   9  25  10  26  11  27  12  28  13  29  14  30  15  31
    __m128i a23_lo = _mm_unpacklo_epi8(in2, in3); //  32  48  33  49  34  50  35  51  36  52  37  53  38  54  39  55
    __m128i a23_hi = _mm_unpackhi_epi8(in2, in3); //  40  56  41  57  42  58  43  59  44  60  45  61  46  62  47  63
    __m128i a45_lo = _mm_unpacklo_epi8(in4, in5); //  64  80  65  81  66  82  67  83  68  84  69  85  70  86  71  87
    __m128i a45_hi = _mm_unpackhi_epi8(in4, in5); //  72  88  73  89  74  90  75  91  76  92  77  93  78  94  79  95
    __m128i a67_lo = _mm_unpacklo_epi8(in6, in7); //  96 112  97 113  98 114  99 115 100 116 101 117 102 118 103 119
    __m128i a67_hi = _mm_unpackhi_epi8(in6, in7); // 104 120 105 121 106 122 107 123 108 124 109 125 110 126 111 127

    // step 2: byte-interleave the two halves of every pair. each result is a 4x4 tile of the
    // matrix (4 rows, 4 columns) stored column by column; "left" = columns 0-3, "right" = columns 4-7
    __m128i b01_left  = _mm_unpacklo_epi8(a01_lo, a01_hi); //   0   8  16  24   1   9  17  25   2  10  18  26   3  11  19  27
    __m128i b01_right = _mm_unpackhi_epi8(a01_lo, a01_hi); //   4  12  20  28   5  13  21  29   6  14  22  30   7  15  23  31
    __m128i b23_left  = _mm_unpacklo_epi8(a23_lo, a23_hi); //  32  40  48  56  33  41  49  57  34  42  50  58  35  43  51  59
    __m128i b23_right = _mm_unpackhi_epi8(a23_lo, a23_hi); //  36  44  52  60  37  45  53  61  38  46  54  62  39  47  55  63
    __m128i b45_left  = _mm_unpacklo_epi8(a45_lo, a45_hi); //  64  72  80  88  65  73  81  89  66  74  82  90  67  75  83  91
    __m128i b45_right = _mm_unpackhi_epi8(a45_lo, a45_hi); //  68  76  84  92  69  77  85  93  70  78  86  94  71  79  87  95
    __m128i b67_left  = _mm_unpacklo_epi8(a67_lo, a67_hi); //  96 104 112 120  97 105 113 121  98 106 114 122  99 107 115 123
    __m128i b67_right = _mm_unpackhi_epi8(a67_lo, a67_hi); // 100 108 116 124 101 109 117 125 102 110 118 126 103 111 119 127

    // step 3: 32-bit interleave of vertically adjacent tiles. each result holds two complete
    // half-columns; "top" = matrix rows 0-7, "bottom" = matrix rows 8-15
    __m128i c01_top    = _mm_unpacklo_epi32(b01_left,  b23_left);  //   0   8  16  24  32  40  48  56   1   9  17  25  33  41  49  57
    __m128i c23_top    = _mm_unpackhi_epi32(b01_left,  b23_left);  //   2  10  18  26  34  42  50  58   3  11  19  27  35  43  51  59
    __m128i c45_top    = _mm_unpacklo_epi32(b01_right, b23_right); //   4  12  20  28  36  44  52  60   5  13  21  29  37  45  53  61
    __m128i c67_top    = _mm_unpackhi_epi32(b01_right, b23_right); //   6  14  22  30  38  46  54  62   7  15  23  31  39  47  55  63
    __m128i c01_bottom = _mm_unpacklo_epi32(b45_left,  b67_left);  //  64  72  80  88  96 104 112 120  65  73  81  89  97 105 113 121
    __m128i c23_bottom = _mm_unpackhi_epi32(b45_left,  b67_left);  //  66  74  82  90  98 106 114 122  67  75  83  91  99 107 115 123
    __m128i c45_bottom = _mm_unpacklo_epi32(b45_right, b67_right); //  68  76  84  92 100 108 116 124  69  77  85  93 101 109 117 125
    __m128i c67_bottom = _mm_unpackhi_epi32(b45_right, b67_right); //  70  78  86  94 102 110 118 126  71  79  87  95 103 111 119 127

    // step 4: glue the top and bottom half of every column together
    *i0 = _mm_unpacklo_epi64(c01_top, c01_bottom);
    *i1 = _mm_unpackhi_epi64(c01_top, c01_bottom);
    *i2 = _mm_unpacklo_epi64(c23_top, c23_bottom);
    *i3 = _mm_unpackhi_epi64(c23_top, c23_bottom);
    *i4 = _mm_unpacklo_epi64(c45_top, c45_bottom);
    *i5 = _mm_unpackhi_epi64(c45_top, c45_bottom);
    *i6 = _mm_unpacklo_epi64(c67_top, c67_bottom);
    *i7 = _mm_unpackhi_epi64(c67_top, c67_bottom);
}
|
|
|
|
// inverse of twp__transpose_bytes_16x8().
//
// on entry register c holds column c of a byte matrix with 16 rows and 8 columns
// (bytes c, c+8, c+16, ..., c+120). on return the matrix is stored row-major again:
// register k holds bytes 16k .. 16k+15, i.e. matrix rows 2k and 2k+1.
twp__INLINE static void twp__transpose_bytes_16x8_reverse(__m128i *i0, __m128i *i1, __m128i *i2, __m128i *i3,
                                                          __m128i *i4, __m128i *i5, __m128i *i6, __m128i *i7)
{
    const __m128i col0 = *i0, col1 = *i1, col2 = *i2, col3 = *i3;
    const __m128i col4 = *i4, col5 = *i5, col6 = *i6, col7 = *i7;

    // step 1: byte-interleave neighbouring columns; "top" = matrix rows 0-7, "bottom" = matrix rows 8-15
    __m128i a01_top    = _mm_unpacklo_epi8(col0, col1); //   0   1   8   9  16  17  24  25  32  33  40  41  48  49  56  57
    __m128i a23_top    = _mm_unpacklo_epi8(col2, col3); //   2   3  10  11  18  19  26  27  34  35  42  43  50  51  58  59
    __m128i a45_top    = _mm_unpacklo_epi8(col4, col5); //   4   5  12  13  20  21  28  29  36  37  44  45  52  53  60  61
    __m128i a67_top    = _mm_unpacklo_epi8(col6, col7); //   6   7  14  15  22  23  30  31  38  39  46  47  54  55  62  63
    __m128i a01_bottom = _mm_unpackhi_epi8(col0, col1); //  64  65  72  73  80  81  88  89  96  97 104 105 112 113 120 121
    __m128i a23_bottom = _mm_unpackhi_epi8(col2, col3); //  66  67  74  75  82  83  90  91  98  99 106 107 114 115 122 123
    __m128i a45_bottom = _mm_unpackhi_epi8(col4, col5); //  68  69  76  77  84  85  92  93 100 101 108 109 116 117 124 125
    __m128i a67_bottom = _mm_unpackhi_epi8(col6, col7); //  70  71  78  79  86  87  94  95 102 103 110 111 118 119 126 127

    // step 2: 16-bit interleave of column pairs; every result holds 4 matrix rows of either
    // the left half (columns 0-3) or the right half (columns 4-7)
    __m128i left_r0_3    = _mm_unpacklo_epi16(a01_top,    a23_top);    //   0   1   2   3   8   9  10  11  16  17  18  19  24  25  26  27
    __m128i left_r4_7    = _mm_unpackhi_epi16(a01_top,    a23_top);    //  32  33  34  35  40  41  42  43  48  49  50  51  56  57  58  59
    __m128i left_r8_11   = _mm_unpacklo_epi16(a01_bottom, a23_bottom); //  64  65  66  67  72  73  74  75  80  81  82  83  88  89  90  91
    __m128i left_r12_15  = _mm_unpackhi_epi16(a01_bottom, a23_bottom); //  96  97  98  99 104 105 106 107 112 113 114 115 120 121 122 123
    __m128i right_r0_3   = _mm_unpacklo_epi16(a45_top,    a67_top);    //   4   5   6   7  12  13  14  15  20  21  22  23  28  29  30  31
    __m128i right_r4_7   = _mm_unpackhi_epi16(a45_top,    a67_top);    //  36  37  38  39  44  45  46  47  52  53  54  55  60  61  62  63
    __m128i right_r8_11  = _mm_unpacklo_epi16(a45_bottom, a67_bottom); //  68  69  70  71  76  77  78  79  84  85  86  87  92  93  94  95
    __m128i right_r12_15 = _mm_unpackhi_epi16(a45_bottom, a67_bottom); // 100 101 102 103 108 109 110 111 116 117 118 119 124 125 126 127

    // step 3: 32-bit interleave of the left and right half of the same rows -> two full rows per register
    *i0 = _mm_unpacklo_epi32(left_r0_3,   right_r0_3);   //   0 ..  15
    *i1 = _mm_unpackhi_epi32(left_r0_3,   right_r0_3);   //  16 ..  31
    *i2 = _mm_unpacklo_epi32(left_r4_7,   right_r4_7);   //  32 ..  47
    *i3 = _mm_unpackhi_epi32(left_r4_7,   right_r4_7);   //  48 ..  63
    *i4 = _mm_unpacklo_epi32(left_r8_11,  right_r8_11);  //  64 ..  79
    *i5 = _mm_unpackhi_epi32(left_r8_11,  right_r8_11);  //  80 ..  95
    *i6 = _mm_unpacklo_epi32(left_r12_15, right_r12_15); //  96 .. 111
    *i7 = _mm_unpackhi_epi32(left_r12_15, right_r12_15); // 112 .. 127
}
|
|
|
|
// loads the eight pixel lines around an edge for the normal loop filter, one line per register,
// in the order p3 p2 p1 p0 | q0 q1 q2 q3. offset is the index of the first q0 pixel.
//
// vert != 0: a line is a piece of an image row; the lines are the rows offset - 4*stride ... offset + 3*stride.
// vert == 0: a line is a piece of an image column; rows of 8 bytes starting at offset - 4 are
//            loaded and then transposed so that every register ends up holding one column.
//
// single plane (plane1 == NULL): 16 pixels of plane0 per register.
// two planes (plane0 and plane1 both non-NULL, which is how the chroma planes are passed):
//            8 pixels of plane0 and 8 pixels of plane1 share one register. for vert != 0 plane0
//            is in the lower and plane1 in the upper 8 bytes; for vert == 0 the transpose leaves
//            the two planes in alternating lanes.
twp__INLINE static void twp__load_pixels_normal(uint8_t *plane0, uint8_t *plane1, int stride, int offset, int vert,
                                                __m128i *p3, __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q2, __m128i *q3)
{
    __m128i *line[8] = { p3, p2, p1, p0, q0, q1, q2, q3 };

    if (plane0 && plane1) {
        // 8 bytes per row from each plane: plane0 goes into the lower, plane1 into the upper half
        int first = vert ? offset - 4*stride : offset - 4;
        const uint8_t *src0 = &plane0[first];
        const uint8_t *src1 = &plane1[first];
        for (int k = 0; k < 8; ++k) {
            __m128i lower = _mm_loadl_epi64((const __m128i *)(src0 + k*stride));
            *line[k] = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(lower), (const double *)(src1 + k*stride)));
        }

        if (!vert)
            twp__transpose_bytes_16x8(p3, p2, p1, p0, q0, q1, q2, q3);
    } else if (vert) {
        // 8 rows of 16 bytes, no shuffling needed
        const uint8_t *src = &plane0[offset - 4*stride];
        for (int k = 0; k < 8; ++k)
            *line[k] = _mm_loadu_si128((const __m128i *)(src + k*stride));
    } else {
        // 16 rows of 8 bytes, two rows per register (even row below, odd row on top)
        const uint8_t *src = &plane0[offset - 4];
        for (int k = 0; k < 8; ++k) {
            const uint8_t *even_row = src + (2*k)*stride;
            const uint8_t *odd_row  = src + (2*k + 1)*stride;
            __m128i lower = _mm_loadl_epi64((const __m128i *)even_row);
            *line[k] = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(lower), (const double *)odd_row));
        }

        twp__transpose_bytes_16x8(p3, p2, p1, p0, q0, q1, q2, q3);
    }
}
|
|
|
|
// counterpart of twp__load_pixels_normal(): writes the eight lines p3 p2 p1 p0 | q0 q1 q2 q3 back
// to where they were loaded from, undoing the transpose for vert == 0 first.
//
// the parameters have the same meaning as for twp__load_pixels_normal(): with plane0 and plane1
// both non-NULL the lower 8 bytes of every (un-transposed) register go to plane0 and the upper
// 8 bytes to plane1; with plane1 == NULL everything goes to plane0.
twp__INLINE static void twp__store_pixels_normal(uint8_t *plane0, uint8_t *plane1, int stride, int offset, int vert,
                                                 __m128i p3, __m128i p2, __m128i p1, __m128i p0, __m128i q0, __m128i q1, __m128i q2, __m128i q3)
{
    __m128i line[8] = { p3, p2, p1, p0, q0, q1, q2, q3 };

    // gcc wants the destination of _mm_storeh_pd to be 8-byte aligned, which the pixel rows are
    // not in general. to stay clear of ub the upper half is stored into this (aligned) local
    // first and then copied to its real destination with memcpy
    double upper_half;

    if (plane0 && plane1) {
        if (!vert)
            twp__transpose_bytes_16x8_reverse(&line[0], &line[1], &line[2], &line[3], &line[4], &line[5], &line[6], &line[7]);

        int first = vert ? offset - 4*stride : offset - 4;

        uint8_t *dst0 = &plane0[first];
        for (int k = 0; k < 8; ++k)
            _mm_storel_epi64((__m128i *)(dst0 + k*stride), line[k]);

        uint8_t *dst1 = &plane1[first];
        for (int k = 0; k < 8; ++k) {
            _mm_storeh_pd(&upper_half, _mm_castsi128_pd(line[k]));
            memcpy(dst1 + k*stride, &upper_half, 8);
        }
    } else if (vert) {
        uint8_t *dst = &plane0[offset - 4*stride];
        for (int k = 0; k < 8; ++k)
            _mm_storeu_si128((__m128i *)(dst + k*stride), line[k]);
    } else {
        twp__transpose_bytes_16x8_reverse(&line[0], &line[1], &line[2], &line[3], &line[4], &line[5], &line[6], &line[7]);

        // every register carries two rows of 8 bytes: even row in the lower, odd row in the upper half
        uint8_t *dst = &plane0[offset - 4];
        for (int k = 0; k < 8; ++k) {
            uint8_t *even_row = dst + (2*k)*stride;
            uint8_t *odd_row  = dst + (2*k + 1)*stride;
            _mm_storel_epi64((__m128i *)even_row, line[k]);
            _mm_storeh_pd(&upper_half, _mm_castsi128_pd(line[k]));
            memcpy(odd_row, &upper_half, 8);
        }
    }
}
|
|
|
|
// per-byte |a - b| with both operands treated as unsigned bytes.
// subtracting the smaller from the larger value can never wrap around.
twp__INLINE static __m128i twp__absdiff_u8(__m128i a, __m128i b)
{
    __m128i larger  = _mm_max_epu8(a, b);
    __m128i smaller = _mm_min_epu8(a, b);
    return _mm_sub_epi8(larger, smaller);
}
|
|
|
|
twp__INLINE static void twp__compute_abs_diffs_normal(__m128i p3, __m128i p2, __m128i p1, __m128i p0,
|
|
__m128i q0, __m128i q1, __m128i q2, __m128i q3,
|
|
__m128i *ad_p3p2, __m128i *ad_p2p1, __m128i *ad_p1p0, __m128i *ad_q0q1,
|
|
__m128i *ad_q1q2, __m128i *ad_q2q3, __m128i *ad_p0q0, __m128i *ad_p1q1)
|
|
{
|
|
*ad_p3p2 = twp__absdiff_u8(p3, p2);
|
|
*ad_p2p1 = twp__absdiff_u8(p2, p1);
|
|
*ad_p1p0 = twp__absdiff_u8(p1, p0);
|
|
*ad_q0q1 = twp__absdiff_u8(q0, q1);
|
|
*ad_q1q2 = twp__absdiff_u8(q1, q2);
|
|
*ad_q2q3 = twp__absdiff_u8(q2, q3);
|
|
*ad_p0q0 = twp__absdiff_u8(p0, q0);
|
|
*ad_p1q1 = twp__absdiff_u8(p1, q1);
|
|
}
|
|
|
|
// toggles the most significant bit of each of the 16 bytes. this maps unsigned 0..255 onto
// signed -128..127 (and back again when applied a second time).
twp__INLINE static __m128i twp__flip_byte_msbs(__m128i bytes)
{
    // xor with 0x80 gives the same byte as adding 0x80 modulo 256
    const __m128i msb = _mm_set1_epi8(-128);
    return _mm_xor_si128(bytes, msb);
}
|
|
|
|
twp__INLINE static void twp__flip_msbs_simple(__m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1)
|
|
{
|
|
*p1 = twp__flip_byte_msbs(*p1);
|
|
*p0 = twp__flip_byte_msbs(*p0);
|
|
*q0 = twp__flip_byte_msbs(*q0);
|
|
*q1 = twp__flip_byte_msbs(*q1);
|
|
}
|
|
|
|
twp__INLINE static void twp__flip_msbs_normal(__m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q2)
|
|
{
|
|
*p2 = twp__flip_byte_msbs(*p2);
|
|
twp__flip_msbs_simple(p1, p0, q0, q1);
|
|
*q2 = twp__flip_byte_msbs(*q2);
|
|
}
|
|
|
|
twp__INLINE static __m128i twp__threshold_simple(int edge_limit, __m128i ad_p0q0, __m128i ad_p1q1)
|
|
{
|
|
__m128i xmm_edge_limit = _mm_set1_epi8((int8_t)edge_limit);
|
|
__m128i simple0 = _mm_adds_epu8(ad_p0q0, ad_p0q0);
|
|
__m128i simple1 = twp__logical_right_shift_bytes_by_1(ad_p1q1);
|
|
__m128i simple2 = _mm_adds_epu8(simple0, simple1);
|
|
__m128i simple3 = _mm_max_epu8(simple2, xmm_edge_limit);
|
|
__m128i simple = _mm_cmpeq_epi8(simple3, xmm_edge_limit);
|
|
return simple;
|
|
}
|
|
|
|
twp__INLINE static __m128i twp__threshold_normal(int interior_limit, int edge_limit,
|
|
__m128i ad_p3p2, __m128i ad_p2p1, __m128i ad_p1p0, __m128i ad_q0q1,
|
|
__m128i ad_q1q2, __m128i ad_q2q3, __m128i ad_p0q0, __m128i ad_p1q1)
|
|
{
|
|
__m128i xmm_interior_limit = _mm_set1_epi8((int8_t)interior_limit);
|
|
|
|
__m128i normal0 = _mm_cmpeq_epi8(_mm_max_epu8(ad_p3p2, xmm_interior_limit), xmm_interior_limit);
|
|
__m128i normal1 = _mm_cmpeq_epi8(_mm_max_epu8(ad_p2p1, xmm_interior_limit), xmm_interior_limit);
|
|
__m128i normal2 = _mm_cmpeq_epi8(_mm_max_epu8(ad_p1p0, xmm_interior_limit), xmm_interior_limit);
|
|
__m128i normal3 = _mm_cmpeq_epi8(_mm_max_epu8(ad_q0q1, xmm_interior_limit), xmm_interior_limit);
|
|
__m128i normal4 = _mm_cmpeq_epi8(_mm_max_epu8(ad_q1q2, xmm_interior_limit), xmm_interior_limit);
|
|
__m128i normal5 = _mm_cmpeq_epi8(_mm_max_epu8(ad_q2q3, xmm_interior_limit), xmm_interior_limit);
|
|
|
|
__m128i simple = twp__threshold_simple(edge_limit, ad_p0q0, ad_p1q1);
|
|
|
|
__m128i th_mask;
|
|
th_mask = _mm_and_si128(normal0, normal1);
|
|
th_mask = _mm_and_si128(th_mask, normal2);
|
|
th_mask = _mm_and_si128(th_mask, normal3);
|
|
th_mask = _mm_and_si128(th_mask, normal4);
|
|
th_mask = _mm_and_si128(th_mask, normal5);
|
|
th_mask = _mm_and_si128(th_mask, simple);
|
|
|
|
return th_mask;
|
|
}
|
|
|
|
// "high edge variance" mask.
// returns 0xff in every lane where |p1 - p0| > hev_threshold or |q0 - q1| > hev_threshold,
// 0x00 elsewhere. assumes hev_threshold + 1 still fits into an unsigned byte.
twp__INLINE static __m128i twp__hev_mask(int hev_threshold, __m128i ad_p1p0, __m128i ad_q0q1)
{
    // x > t is tested as x >= t + 1, and unsigned x >= y as max(x, y) == x
    __m128i limit = _mm_set1_epi8((int8_t)(hev_threshold + 1));

    // one of the two differences reaches the limit exactly when the larger one does
    __m128i worst = _mm_max_epu8(ad_p1p0, ad_q0q1);
    return _mm_cmpeq_epi8(_mm_max_epu8(worst, limit), worst);
}
|
|
|
|
twp__INLINE static __m128i twp__filter_common(__m128i p1, __m128i *p0, __m128i *q0, __m128i q1,
|
|
__m128i use_outer_taps, __m128i write_mask)
|
|
{
|
|
// this incurs a slight overhead for the normal mb and simple cases, because use_outer_taps is always 1
|
|
// (assuming the compiler doesn't inline this function and realize that, which it might)
|
|
__m128i a = _mm_and_si128(use_outer_taps, _mm_subs_epi8(p1, q1));
|
|
__m128i d_q0p0 = _mm_subs_epi8(*q0, *p0);
|
|
a = _mm_adds_epi8(a, d_q0p0);
|
|
a = _mm_adds_epi8(a, d_q0p0);
|
|
a = _mm_adds_epi8(a, d_q0p0);
|
|
|
|
__m128i b = _mm_adds_epi8(a, _mm_set1_epi8(3));
|
|
b = twp__arith_right_shift_bytes_by_3(b);
|
|
|
|
__m128i c = _mm_adds_epi8(a, _mm_set1_epi8(4));
|
|
c = twp__arith_right_shift_bytes_by_3(c);
|
|
|
|
__m128i d_q0c = _mm_subs_epi8(*q0, c);
|
|
__m128i s_p0b = _mm_adds_epi8(*p0, b);
|
|
*q0 = _mm_or_si128(_mm_and_si128(write_mask, d_q0c), _mm_andnot_si128(write_mask, *q0));
|
|
*p0 = _mm_or_si128(_mm_and_si128(write_mask, s_p0b), _mm_andnot_si128(write_mask, *p0));
|
|
|
|
return c;
|
|
}
|
|
|
|
static void twp__normal_filter_mb(int edge_limit, int interior_limit, int hev_threshold,
|
|
uint8_t *plane0, uint8_t *plane1, int stride, int offset, int vert)
|
|
{
|
|
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
|
|
__m128i ad_p3p2, ad_p2p1, ad_p1p0, ad_q0q1, ad_q1q2, ad_q2q3, ad_p0q0, ad_p1q1;
|
|
|
|
twp__load_pixels_normal(plane0, plane1, stride, offset, vert, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
|
twp__compute_abs_diffs_normal(p3, p2, p1, p0, q0, q1, q2, q3, &ad_p3p2, &ad_p2p1, &ad_p1p0, &ad_q0q1, &ad_q1q2, &ad_q2q3, &ad_p0q0, &ad_p1q1);
|
|
twp__flip_msbs_normal(&p2, &p1, &p0, &q0, &q1, &q2); // need to convert to signed after we took the absolute differences because _mm_min_epu8 and _mm_min_epu8 take unsigned values
|
|
|
|
__m128i th_mask = twp__threshold_normal(interior_limit, edge_limit, ad_p3p2, ad_p2p1, ad_p1p0, ad_q0q1, ad_q1q2, ad_q2q3, ad_p0q0, ad_p1q1);
|
|
__m128i is_hev_mask = twp__hev_mask(hev_threshold, ad_p1p0, ad_q0q1);
|
|
|
|
__m128i hev_and_th = _mm_and_si128(is_hev_mask, th_mask);
|
|
__m128i hev_or_not_th = _mm_or_si128(is_hev_mask, _mm_andnot_si128(th_mask, _mm_set1_epi8(-1/*0xff*/)));
|
|
|
|
twp__filter_common(p1, &p0, &q0, q1, _mm_set1_epi8(-1/*0xff*/), hev_and_th);
|
|
|
|
__m128i w = _mm_subs_epi8(p1, q1);
|
|
__m128i d_q0p0 = _mm_subs_epi8(q0, p0);
|
|
w = _mm_adds_epi8(w, d_q0p0);
|
|
w = _mm_adds_epi8(w, d_q0p0);
|
|
w = _mm_adds_epi8(w, d_q0p0);
|
|
|
|
// convert to 16-bit with sign extension
|
|
__m128i tmp = _mm_cmplt_epi8(w, _mm_setzero_si128());
|
|
__m128i w0 = _mm_unpacklo_epi8(w, tmp);
|
|
__m128i w1 = _mm_unpackhi_epi8(w, tmp);
|
|
|
|
// compute 9*w using a shift and an add instead of a mul, which should be slightly faster
|
|
__m128i w9_0 = _mm_add_epi16(_mm_slli_epi16(w0, 3), w0);
|
|
__m128i w9_1 = _mm_add_epi16(_mm_slli_epi16(w1, 3), w1);
|
|
|
|
// we can compute 9*w, 18*w and 27*w by just using adds instead of muls,
|
|
// since 18*w = 9*w + 9*w and 27*w = 18*w + 9*w
|
|
__m128i c0 = _mm_add_epi16(w9_0, _mm_set1_epi16(63)); // 9*w + 63
|
|
__m128i c1 = _mm_add_epi16(w9_1, _mm_set1_epi16(63));
|
|
__m128i b0 = _mm_add_epi16(c0, w9_0); // 18*w + 63
|
|
__m128i b1 = _mm_add_epi16(c1, w9_1);
|
|
__m128i a0 = _mm_add_epi16(b0, w9_0); // 27*w + 63
|
|
__m128i a1 = _mm_add_epi16(b1, w9_1);
|
|
a0 = _mm_srai_epi16(a0, 7);
|
|
a1 = _mm_srai_epi16(a1, 7);
|
|
b0 = _mm_srai_epi16(b0, 7);
|
|
b1 = _mm_srai_epi16(b1, 7);
|
|
c0 = _mm_srai_epi16(c0, 7);
|
|
c1 = _mm_srai_epi16(c1, 7);
|
|
|
|
__m128i c = _mm_packs_epi16(c0, c1);
|
|
__m128i b = _mm_packs_epi16(b0, b1);
|
|
__m128i a = _mm_packs_epi16(a0, a1);
|
|
|
|
__m128i d_q0a = _mm_subs_epi8(q0, a);
|
|
__m128i s_p0a = _mm_adds_epi8(p0, a);
|
|
q0 = _mm_or_si128(_mm_and_si128(hev_or_not_th, q0), _mm_andnot_si128(hev_or_not_th, d_q0a));
|
|
p0 = _mm_or_si128(_mm_and_si128(hev_or_not_th, p0), _mm_andnot_si128(hev_or_not_th, s_p0a));
|
|
|
|
__m128i d_q1b = _mm_subs_epi8(q1, b);
|
|
__m128i s_p1b = _mm_adds_epi8(p1, b);
|
|
q1 = _mm_or_si128(_mm_and_si128(hev_or_not_th, q1), _mm_andnot_si128(hev_or_not_th, d_q1b));
|
|
p1 = _mm_or_si128(_mm_and_si128(hev_or_not_th, p1), _mm_andnot_si128(hev_or_not_th, s_p1b));
|
|
|
|
__m128i d_q2c = _mm_subs_epi8(q2, c);
|
|
__m128i s_p2c = _mm_adds_epi8(p2, c);
|
|
q2 = _mm_or_si128(_mm_and_si128(hev_or_not_th, q2), _mm_andnot_si128(hev_or_not_th, d_q2c));
|
|
p2 = _mm_or_si128(_mm_and_si128(hev_or_not_th, p2), _mm_andnot_si128(hev_or_not_th, s_p2c));
|
|
|
|
twp__flip_msbs_normal(&p2, &p1, &p0, &q0, &q1, &q2);
|
|
twp__store_pixels_normal(plane0, plane1, stride, offset, vert, p3, p2, p1, p0, q0, q1, q2, q3);
|
|
}
|
|
|
|
static void twp__normal_filter_sb(int edge_limit, int interior_limit, int hev_threshold,
|
|
uint8_t *plane0, uint8_t *plane1, int stride, int offset, int vert)
|
|
{
|
|
int chroma = (plane0 && plane1);
|
|
for (int j = 0; j < (chroma ? 1 : 3); ++j) {
|
|
int sb_offset = offset;
|
|
if (vert)
|
|
sb_offset += (j+1)*4*stride;
|
|
else
|
|
sb_offset += (j+1)*4;
|
|
|
|
__m128i p3, p2, p1, p0, q0, q1, q2, q3;
|
|
__m128i ad_p3p2, ad_p2p1, ad_p1p0, ad_q0q1, ad_q1q2, ad_q2q3, ad_p0q0, ad_p1q1;
|
|
|
|
twp__load_pixels_normal(plane0, plane1, stride, sb_offset, vert, &p3, &p2, &p1, &p0, &q0, &q1, &q2, &q3);
|
|
twp__compute_abs_diffs_normal(p3, p2, p1, p0, q0, q1, q2, q3, &ad_p3p2, &ad_p2p1, &ad_p1p0, &ad_q0q1, &ad_q1q2, &ad_q2q3, &ad_p0q0, &ad_p1q1);
|
|
twp__flip_msbs_normal(&p2, &p1, &p0, &q0, &q1, &q2); // need to convert to signed after we took the absolute differences because _mm_min_epu8 and _mm_min_epu8 take unsigned values
|
|
|
|
__m128i th_mask = twp__threshold_normal(interior_limit, edge_limit, ad_p3p2, ad_p2p1, ad_p1p0, ad_q0q1, ad_q1q2, ad_q2q3, ad_p0q0, ad_p1q1);
|
|
__m128i is_hev_mask = twp__hev_mask(hev_threshold, ad_p1p0, ad_q0q1);
|
|
__m128i not_hev_and_th = _mm_andnot_si128(is_hev_mask, th_mask);
|
|
|
|
__m128i a = twp__filter_common(p1, &p0, &q0, q1, is_hev_mask, th_mask);
|
|
|
|
// to compute (a + 1) >> 1 with "a" being signed, we do this:
|
|
// 1. convert back to unsigned by flipping the msb
|
|
// 2. use _mm_avg_epu8 to compute (a + 1) >> 1 on the unsigned value
|
|
// 3. add 0b11000000 (-64) to flip the msb back and sign-extend; this works because if
|
|
// bit 7 (the previous msb) is 1 (meaning the value should be positive since it's flipped),
|
|
// after the add bit 7 and 8 will be 0 because of the carry. if bit 7 is 0, then we want
|
|
// bit 7 and 8 to be 1, and since there is no carry in that case, everything works
|
|
a = twp__flip_byte_msbs(a);
|
|
a = _mm_avg_epu8(a, _mm_setzero_si128());
|
|
a = _mm_add_epi8(a, _mm_set1_epi8(-64));
|
|
|
|
__m128i d_q1a = _mm_subs_epi8(q1, a);
|
|
__m128i s_p1a = _mm_adds_epi8(p1, a);
|
|
q1 = _mm_or_si128(_mm_and_si128(not_hev_and_th, d_q1a), _mm_andnot_si128(not_hev_and_th, q1));
|
|
p1 = _mm_or_si128(_mm_and_si128(not_hev_and_th, s_p1a), _mm_andnot_si128(not_hev_and_th, p1));
|
|
|
|
twp__flip_msbs_normal(&p2, &p1, &p0, &q0, &q1, &q2);
|
|
twp__store_pixels_normal(plane0, plane1, stride, sb_offset, vert, p3, p2, p1, p0, q0, q1, q2, q3);
|
|
}
|
|
}
|
|
|
|
static void twp__simple_filter_mb(int edge_limit, uint8_t *plane, int stride, int offset, int vert)
|
|
{
|
|
__m128i p1, p0, q0, q1;
|
|
if (vert) {
|
|
uint8_t *ptr = &plane[offset - 2*stride];
|
|
p1 = _mm_loadu_si128((__m128i *)ptr); ptr += stride;
|
|
p0 = _mm_loadu_si128((__m128i *)ptr); ptr += stride;
|
|
q0 = _mm_loadu_si128((__m128i *)ptr); ptr += stride;
|
|
q1 = _mm_loadu_si128((__m128i *)ptr);
|
|
} else {
|
|
uint8_t *ptr = &plane[offset - 2];
|
|
|
|
__m128i i0 = _mm_loadl_epi64((__m128i *)ptr); ptr += stride;
|
|
i0 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(i0), (double *)ptr)); ptr += stride;
|
|
__m128i i1 = _mm_loadl_epi64((__m128i *)ptr); ptr += stride;
|
|
i1 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(i1), (double *)ptr)); ptr += stride;
|
|
__m128i i2 = _mm_loadl_epi64((__m128i *)ptr); ptr += stride;
|
|
i2 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(i2), (double *)ptr)); ptr += stride;
|
|
__m128i i3 = _mm_loadl_epi64((__m128i *)ptr); ptr += stride;
|
|
i3 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(i3), (double *)ptr)); ptr += stride;
|
|
__m128i i4 = _mm_loadl_epi64((__m128i *)ptr); ptr += stride;
|
|
i4 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(i4), (double *)ptr)); ptr += stride;
|
|
__m128i i5 = _mm_loadl_epi64((__m128i *)ptr); ptr += stride;
|
|
i5 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(i5), (double *)ptr)); ptr += stride;
|
|
__m128i i6 = _mm_loadl_epi64((__m128i *)ptr); ptr += stride;
|
|
i6 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(i6), (double *)ptr)); ptr += stride;
|
|
__m128i i7 = _mm_loadl_epi64((__m128i *)ptr); ptr += stride;
|
|
i7 = _mm_castpd_si128(_mm_loadh_pd(_mm_castsi128_pd(i7), (double *)ptr)); ptr += stride;
|
|
|
|
// i0 = 0 1 2 3 .. .. .. .. 4 5 6 7 .. .. .. ..
|
|
// i1 = 8 9 10 11 .. .. .. .. 12 13 14 15 .. .. .. ..
|
|
// i2 = 16 17 18 19 .. .. .. .. 20 21 22 23 .. .. .. ..
|
|
// i3 = 24 25 26 27 .. .. .. .. 28 29 30 31 .. .. .. ..
|
|
// i4 = 32 33 34 35 .. .. .. .. 36 37 38 39 .. .. .. ..
|
|
// i5 = 40 41 42 43 .. .. .. .. 44 45 46 47 .. .. .. ..
|
|
// i6 = 48 49 50 51 .. .. .. .. 52 53 54 55 .. .. .. ..
|
|
// i7 = 56 57 58 59 .. .. .. .. 60 61 62 63 .. .. .. ..
|
|
|
|
__m128i t0 = _mm_unpacklo_epi8(i0, i1); // 0 8 1 9 2 10 3 11 .. .. .. .. .. .. .. ..
|
|
__m128i t1 = _mm_unpacklo_epi8(i2, i3); // 16 24 17 25 18 26 19 27 .. .. .. .. .. .. .. ..
|
|
__m128i t2 = _mm_unpacklo_epi8(i4, i5); // 32 40 33 41 34 42 35 43 .. .. .. .. .. .. .. ..
|
|
__m128i t3 = _mm_unpacklo_epi8(i6, i7); // 48 56 49 57 50 58 51 59 .. .. .. .. .. .. .. ..
|
|
__m128i t4 = _mm_unpackhi_epi8(i0, i1); // 4 12 5 13 6 14 7 15 .. .. .. .. .. .. .. ..
|
|
__m128i t5 = _mm_unpackhi_epi8(i2, i3); // 20 28 21 29 22 30 23 31 .. .. .. .. .. .. .. ..
|
|
__m128i t6 = _mm_unpackhi_epi8(i4, i5); // 36 44 37 45 38 46 39 47 .. .. .. .. .. .. .. ..
|
|
__m128i t7 = _mm_unpackhi_epi8(i6, i7); // 52 60 53 61 54 62 55 63 .. .. .. .. .. .. .. ..
|
|
|
|
__m128i t8 = _mm_unpacklo_epi8(t0, t4); // 0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15
|
|
__m128i t9 = _mm_unpacklo_epi8(t1, t5); // 16 20 24 28 17 21 25 29 18 22 26 30 19 23 27 31
|
|
__m128i t10 = _mm_unpacklo_epi8(t2, t6); // 32 36 40 44 33 37 41 45 34 38 42 46 35 39 43 47
|
|
__m128i t11 = _mm_unpacklo_epi8(t3, t7); // 48 52 56 60 49 53 57 61 50 54 58 62 51 55 59 63
|
|
|
|
__m128i t12 = _mm_unpacklo_epi32(t8, t9); // 0 4 8 12 16 20 24 28 1 5 9 13 17 21 25 29
|
|
__m128i t13 = _mm_unpacklo_epi32(t10, t11); // 32 36 40 44 48 52 56 60 33 37 41 45 49 53 57 61
|
|
__m128i t14 = _mm_unpackhi_epi32(t8, t9); // 2 6 10 14 18 22 26 30 3 7 11 15 19 23 27 31
|
|
__m128i t15 = _mm_unpackhi_epi32(t10, t11); // 34 38 42 46 50 54 58 62 35 39 43 47 51 55 59 63
|
|
|
|
p1 = _mm_unpacklo_epi64(t12, t13); // 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
|
|
p0 = _mm_unpackhi_epi64(t12, t13); // 1 5 9 13 17 21 25 29 33 37 41 45 49 53 57 61
|
|
q0 = _mm_unpacklo_epi64(t14, t15); // 2 6 10 14 18 22 26 30 34 38 42 46 50 54 58 62
|
|
q1 = _mm_unpackhi_epi64(t14, t15); // 3 7 11 15 19 23 27 31 35 39 43 47 51 55 59 63
|
|
}
|
|
|
|
__m128i ad_p0q0 = twp__absdiff_u8(p0, q0);
|
|
__m128i ad_p1q1 = twp__absdiff_u8(p1, q1);
|
|
twp__flip_msbs_simple(&p1, &p0, &q0, &q1);
|
|
__m128i threshold = twp__threshold_simple(edge_limit, ad_p0q0, ad_p1q1);
|
|
twp__filter_common(p1, &p0, &q0, q1, _mm_set1_epi8(-1/*0xff*/), threshold);
|
|
twp__flip_msbs_simple(&p1, &p0, &q0, &q1);
|
|
|
|
if (vert) {
|
|
uint8_t *ptr = &plane[offset - 1*stride];
|
|
_mm_storeu_si128((__m128i *)ptr, p0); ptr += stride;
|
|
_mm_storeu_si128((__m128i *)ptr, q0);
|
|
} else {
|
|
uint8_t *ptr = &plane[offset - 2];
|
|
|
|
// p1 = 0 4 8 12 16 20 24 28 32 36 40 44 48 52 56 60
|
|
// p0 = 1 5 9 13 17 21 25 29 33 37 41 45 49 53 57 61
|
|
// q0 = 2 6 10 14 18 22 26 30 34 38 42 46 50 54 58 62
|
|
// q1 = 3 7 11 15 19 23 27 31 35 39 43 47 51 55 59 63
|
|
|
|
__m128i t0 = _mm_unpacklo_epi8(p1, p0); // 0 1 4 5 8 9 12 13 16 17 20 21 24 25 28 29
|
|
__m128i t1 = _mm_unpacklo_epi8(q0, q1); // 2 3 6 7 10 11 14 15 18 19 22 23 26 27 30 31
|
|
__m128i t2 = _mm_unpackhi_epi8(p1, p0); // 32 33 36 37 40 41 44 45 48 49 52 53 56 57 60 61
|
|
__m128i t3 = _mm_unpackhi_epi8(q0, q1); // 34 35 38 39 42 43 46 47 50 51 54 55 58 59 62 63
|
|
|
|
__m128i t4 = _mm_unpacklo_epi16(t0, t1); // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
|
|
__m128i t5 = _mm_unpacklo_epi16(t2, t3); // 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
|
|
__m128i t6 = _mm_unpackhi_epi16(t0, t1); // 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
|
|
__m128i t7 = _mm_unpackhi_epi16(t2, t3); // 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63
|
|
|
|
int i;
|
|
i = _mm_cvtsi128_si32(t4); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t4, 4)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t4, 8)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t4, 12)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(t6); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t6, 4)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t6, 8)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t6, 12)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(t5); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t5, 4)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t5, 8)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t5, 12)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(t7); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t7, 4)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t7, 8)); memcpy(ptr, &i, 4); ptr += stride;
|
|
i = _mm_cvtsi128_si32(_mm_srli_si128(t7, 12)); memcpy(ptr, &i, 4);
|
|
}
|
|
}
|
|
|
|
#endif
|
|
|
|
// simple loop filter for the three subblock edges inside a macroblock, i.e. the edges that lie
// 4, 8 and 12 pixels into the macroblock. each edge is handed to twp__simple_filter_mb() with the
// given edge_limit; mb_offset is the index into plane at which the macroblock starts.
static void twp__simple_filter_sb(int edge_limit, uint8_t *plane, int stride, int mb_offset, int vert)
{
    int edge_step = vert ? 4*stride : 4; // distance from one subblock edge to the next

    for (int edge = 1; edge <= 3; ++edge)
        twp__simple_filter_mb(edge_limit, plane, stride, mb_offset + edge*edge_step, vert);
}
|
|
|
|
// Applies the in-loop deblocking filter to every macroblock of the decoded frame, in raster order.
// For each macroblock the order is: left macroblock edge, interior edges in the same direction,
// top macroblock edge, interior edges in that direction. The left/top macroblock edges are
// skipped for the first column/row, the interior edges when the macroblock is flagged with
// skip_sb_filtering. The normal filter touches luma and both chroma planes, the simple filter
// only touches luma.
static void twp__do_loop_filtering(twp__vp8_data *data)
{
    // values for the last ("vert") argument of the filter functions
    const int left_edges = 0;
    const int top_edges = 1;

    for (int row = 0; row < data->mbs_per_col; ++row) {
        for (int col = 0; col < data->mbs_per_row; ++col) {
            twp__mb_info *info = &data->mb_infos[row*data->mbs_per_row + col];
            int luma_off = twp__calc_mb_offset(col, row, data->luma_stride, 0);
            int chroma_off = twp__calc_mb_offset(col, row, data->chroma_stride, 1);

            int level = twp__get_filter_level(data, info);
            if (level == 0)
                continue; // nothing to filter for this macroblock

            int interior = twp__get_interior_limit(level, data->loop_filter_sharpness);
            int hev = twp__get_hev_threshold(level);
            int mb_limit, sb_limit;
            twp__get_edge_limits(level, interior, &mb_limit, &sb_limit);

            if (data->loop_filter_type == twp__FILTER_NORMAL) {
                if (col != 0) {
                    twp__normal_filter_mb(mb_limit, interior, hev, data->plane_y, NULL, data->luma_stride, luma_off, left_edges);
                    twp__normal_filter_mb(mb_limit, interior, hev, data->plane_u, data->plane_v, data->chroma_stride, chroma_off, left_edges);
                }
                if (!info->skip_sb_filtering) {
                    twp__normal_filter_sb(sb_limit, interior, hev, data->plane_y, NULL, data->luma_stride, luma_off, left_edges);
                    twp__normal_filter_sb(sb_limit, interior, hev, data->plane_u, data->plane_v, data->chroma_stride, chroma_off, left_edges);
                }

                if (row != 0) {
                    twp__normal_filter_mb(mb_limit, interior, hev, data->plane_y, NULL, data->luma_stride, luma_off, top_edges);
                    twp__normal_filter_mb(mb_limit, interior, hev, data->plane_u, data->plane_v, data->chroma_stride, chroma_off, top_edges);
                }
                if (!info->skip_sb_filtering) {
                    twp__normal_filter_sb(sb_limit, interior, hev, data->plane_y, NULL, data->luma_stride, luma_off, top_edges);
                    twp__normal_filter_sb(sb_limit, interior, hev, data->plane_u, data->plane_v, data->chroma_stride, chroma_off, top_edges);
                }
            } else if (data->loop_filter_type == twp__FILTER_SIMPLE) {
                if (col != 0)
                    twp__simple_filter_mb(mb_limit, data->plane_y, data->luma_stride, luma_off, left_edges);
                if (!info->skip_sb_filtering)
                    twp__simple_filter_sb(sb_limit, data->plane_y, data->luma_stride, luma_off, left_edges);
                if (row != 0)
                    twp__simple_filter_mb(mb_limit, data->plane_y, data->luma_stride, luma_off, top_edges);
                if (!info->skip_sb_filtering)
                    twp__simple_filter_sb(sb_limit, data->plane_y, data->luma_stride, luma_off, top_edges);
            } else {
                twp__assert(0); // unknown filter type
            }
        }
    }
}
|
|
|
|
// Converts planar 4:2:0 YUV to a tightly packed, interleaved RGB or RGBA image.
//
// plane_y/plane_u/plane_v: start of each plane's buffer. The first row and the first column of
//                          every plane are skipped (the image starts at plane + stride + 1).
// img_width/img_height:    size of the output image in pixels.
// luma_stride/chroma_stride: distance in bytes between two rows of the luma/chroma planes.
// rgba:                    non-zero to produce 4 bytes per pixel (alpha is set to 255),
//                          zero to produce 3 bytes per pixel.
//
// Returns a malloc()ed buffer of img_width*img_height*(rgba ? 4 : 3) bytes that the caller has
// to free(), or NULL if the size is invalid or the allocation failed.
static uint8_t *twp_yuv_to_rgb(uint8_t *plane_y, uint8_t *plane_u, uint8_t *plane_v,
                               int img_width, int img_height, int luma_stride, int chroma_stride, int rgba)
{
    // https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion

    // this uses 16.16 fixed point and takes advantage of _mm_mulhi_epu16. however, that means that
    // we lose the decimal part immediately after the multiply, so we lose some information there.
    // seems good enough, though. i can't really perceive any difference compared to a "perfect"
    // floating point implementation

    int comp = rgba ? 4 : 3;

    // compute the buffer size in size_t and refuse sizes that can't be represented
    if (img_width < 0 || img_height < 0)
        return NULL;
    size_t out_w = (size_t)img_width;
    size_t out_h = (size_t)img_height;
    if (out_h != 0 && out_w > SIZE_MAX / (size_t)comp / out_h)
        return NULL;

    uint8_t *mem = (uint8_t *)malloc(out_w * out_h * (size_t)comp);
    if (!mem)
        return NULL; // without this check the loops below would write through a NULL pointer

    uint8_t *src_y = plane_y + luma_stride + 1;
    uint8_t *src_u = plane_u + chroma_stride + 1;
    uint8_t *src_v = plane_v + chroma_stride + 1;
    uint8_t *dst = mem;

    for (int row = 0; row < img_height; ++row) {
        int col = 0;

#ifdef twp__SSE2
        // todo: i don't know if there is a good way to convert to rgb without pshufb, which we can't use
        // because we are limited to sse2. so only yuv->rgba conversion is vectorized (8 pixels at a
        // time), 3-component rgb always uses the non-simd path below
        for (; rgba && col+8 <= img_width; col += 8) {
            // load and upsample u, v
            __m128i y = _mm_loadl_epi64((__m128i *)(src_y + col));
            int i;
            memcpy(&i, src_u + col/2, 4);
            __m128i u = _mm_cvtsi32_si128(i);
            u = _mm_unpacklo_epi8(u, u);
            memcpy(&i, src_v + col/2, 4);
            __m128i v = _mm_cvtsi32_si128(i);
            v = _mm_unpacklo_epi8(v, v);
            __m128i a = _mm_set1_epi16(255);

            // convert to 16-bit and shift left by 8
            __m128i yw = _mm_unpacklo_epi8(_mm_setzero_si128(), y);
            __m128i uw = _mm_unpacklo_epi8(_mm_setzero_si128(), u);
            __m128i vw = _mm_unpacklo_epi8(_mm_setzero_si128(), v);

            // do the actual computation
            __m128i t0 = _mm_mulhi_epu16(yw, _mm_set1_epi16((short)(1.164*256+0.5)));
            __m128i t1 = _mm_mulhi_epu16(vw, _mm_set1_epi16((short)(1.596*256+0.5)));
            __m128i t2 = _mm_add_epi16(t0, t1);
            __m128i r = _mm_sub_epi16(t2, _mm_set1_epi16(223));

            __m128i t3 = _mm_mulhi_epu16(uw, _mm_set1_epi16((short)(0.392*256+0.5)));
            __m128i t4 = _mm_mulhi_epu16(vw, _mm_set1_epi16((short)(0.813*256+0.5)));
            __m128i t5 = _mm_sub_epi16(t0, t3);
            __m128i t6 = _mm_sub_epi16(t5, t4);
            __m128i g = _mm_add_epi16(t6, _mm_set1_epi16(135)); // 135 seems to work better than 136

            __m128i t7 = _mm_mulhi_epu16(uw, _mm_set1_epi16((short)(2.017*256+0.5)));
            __m128i t8 = _mm_add_epi16(t0, t7);
            __m128i b = _mm_sub_epi16(t8, _mm_set1_epi16(277));

            // convert back to 8-bit and transpose
            __m128i rb = _mm_packus_epi16(r, b);        // rrrrrrrrbbbbbbbb
            __m128i ga = _mm_packus_epi16(g, a);        // ggggggggaaaaaaaa
            __m128i rg = _mm_unpacklo_epi8(rb, ga);     // rgrgrgrgrgrgrgrg
            __m128i ba = _mm_unpackhi_epi8(rb, ga);     // babababababababa
            __m128i rgba0 = _mm_unpacklo_epi16(rg, ba); // rgbargbargbargba
            __m128i rgba1 = _mm_unpackhi_epi16(rg, ba); // rgbargbargbargba

            // store
            _mm_storeu_si128((__m128i *)(dst + 0), rgba0);
            _mm_storeu_si128((__m128i *)(dst + 16), rgba1);
            dst += 32;
        }
#endif

        // scalar path: handles everything the simd loop above did not cover
        for (; col < img_width; ++col) {
            int y = (int)src_y[col];
            int u = (int)src_u[col>>1]; // chroma is subsampled horizontally...
            int v = (int)src_v[col>>1];

#define twp__FPMUL(c, x) (((int)((c)*256+0.5) * ((x)<<8)) >> 16)
            int ytmp = twp__FPMUL(1.164, y);
            int r = ytmp + twp__FPMUL(1.596, v) - 223;
            int g = ytmp - twp__FPMUL(0.392, u) - twp__FPMUL(0.813, v) + 135;
            int b = ytmp + twp__FPMUL(2.017, u) - 277;
#undef twp__FPMUL

            *dst++ = (uint8_t)twp__clamp(r, 0, 255);
            *dst++ = (uint8_t)twp__clamp(g, 0, 255);
            *dst++ = (uint8_t)twp__clamp(b, 0, 255);
            if (rgba) *dst++ = 255;
        }

        src_y += luma_stride;
        if (row & 1) { // ...and vertically, so only advance after every second row
            src_u += chroma_stride;
            src_v += chroma_stride;
        }
    }

    return mem;
}
|
|
|
|
// Resets a decoder state to its defaults: everything zeroed, coefficient probabilities set to
// the default table and all segment id tree probabilities set to 255.
static void twp__vp8_init(twp__vp8_data *data)
{
    memset(data, 0, sizeof(*data));

    // make sure the destination really has the same size as the table we copy from
    // (this used to compare the table's size with itself, which can never fail)
    twp__assert(sizeof(data->coeff_probs) == sizeof(twp__default_coeff_probs));
    memcpy(data->coeff_probs, twp__default_coeff_probs, sizeof(twp__default_coeff_probs));

    for (int i = 0; i < twp__arrlen(data->segment_id_tree_probs); ++i)
        data->segment_id_tree_probs[i] = 255;
}
|
|
|
|
// Decodes a lossy (VP8) bitstream.
//
// raw_bytes/num_bytes: the VP8 chunk payload.
// width/height:        receive the image size; only written once the bitstream was decoded.
// format:              for twp_FORMAT_YUV/twp_FORMAT_YUVA the decoded plane buffer itself is
//                      returned, otherwise it is converted to RGB/RGBA.
// flags:               twp_FLAG_SKIP_LOOP_FILTER disables the loop filter.
//
// Returns a buffer the caller has to free(), or NULL on failure.
static uint8_t *twp__read_vp8(void *raw_bytes, int num_bytes, int *width, int *height, twp_format format, twp_flags flags)
{
    uint8_t *result = NULL;

    twp__vp8_data data;
    twp__vp8_init(&data);

    if (!twp__read_vp8_header(&data, (uint8_t *)raw_bytes, num_bytes)) goto end;
    if (!twp__read_yuv_data(&data, format)) goto end;

    if (!(flags & twp_FLAG_SKIP_LOOP_FILTER))
        twp__do_loop_filtering(&data);

    *width = data.width;
    *height = data.height;

    if (format == twp_FORMAT_YUV || format == twp_FORMAT_YUVA) {
        // ownership of the plane buffer moves to the caller
        result = data.plane_y;
        goto end;
    }

    result = twp_yuv_to_rgb(data.plane_y, data.plane_u, data.plane_v, data.width,
                            data.height, data.luma_stride, data.chroma_stride,
                            format == twp_FORMAT_RGBA);

end:;

    // free the plane buffer unless it is what we hand back to the caller. checking the result
    // instead of the format also releases it when decoding failed for a yuv format, which
    // used to leak
    if (result != data.plane_y)
        free(data.plane_y);
    free(data.mb_infos);

    return result;
}
|
|
|
|
static int twp__read_alpha(void *data, int data_len, int width, int height, unsigned char *pix, int yuva)
|
|
{
|
|
unsigned char *u8data = (unsigned char *)data;
|
|
|
|
int hdr = *u8data;
|
|
// int pp = (hdr >> 4) & 0x3; // we don't do anything with this
|
|
int filter = (hdr >> 2) & 0x3;
|
|
int compression = hdr & 0x3;
|
|
|
|
if (compression != 0 && compression != 1)
|
|
return 0;
|
|
|
|
int src_pixsize;
|
|
unsigned char *buf;
|
|
unsigned char *src;
|
|
if (compression == 0) {
|
|
if (data_len-1 != width*height)
|
|
return 0;
|
|
buf = NULL;
|
|
src_pixsize = 1;
|
|
src = u8data + 1;
|
|
} else {
|
|
buf = twp__read_vp8l(u8data + 1, data_len-1, &width, &height, twp_FORMAT_RGBA, 1);
|
|
if (!buf)
|
|
return 0;
|
|
src_pixsize = 4;
|
|
src = buf + 1;
|
|
}
|
|
|
|
// this is a really slow way of doing this, but it seems this is never a hotspot, so i don't care
|
|
|
|
// when yuva = 1, we don't have an rgba image in which we want to fill in the alpha channel values.
|
|
// instead, we just want to directly return the planar alpha channel
|
|
int dst_pixsize;
|
|
int dst_stride;
|
|
unsigned char *dst;
|
|
if (yuva) {
|
|
twp_unpack_yuv(pix, width, height, NULL, NULL, NULL, &dst, NULL, NULL, &dst_stride);
|
|
dst_pixsize = 1;
|
|
} else {
|
|
dst_stride = width * 4;
|
|
dst = pix + 3;
|
|
dst_pixsize = 4;
|
|
}
|
|
|
|
for (int y = 0; y < height; ++y) {
|
|
for (int x = 0; x < width; ++x) {
|
|
int A, B, C;
|
|
if (x > 0 && y > 0) {
|
|
A = dst[-dst_pixsize];
|
|
B = dst[-dst_stride];
|
|
C = dst[-dst_stride - dst_pixsize];
|
|
} else if (x > 0) {
|
|
A = dst[-dst_pixsize];
|
|
B = A;
|
|
C = A;
|
|
} else if (y > 0) {
|
|
B = dst[-dst_stride];
|
|
A = B;
|
|
C = B;
|
|
} else {
|
|
A = 0;
|
|
B = 0;
|
|
C = 0;
|
|
}
|
|
|
|
int pred;
|
|
switch (filter) {
|
|
case 0: pred = 0; break;
|
|
case 1: pred = A; break;
|
|
case 2: pred = B; break;
|
|
case 3: pred = twp__clamp(A + B - C, 0, 255); break;
|
|
default: pred = 0; twp__assert(0); break;
|
|
}
|
|
|
|
*dst = (uint8_t)(*src + pred);
|
|
dst += dst_pixsize;
|
|
src += src_pixsize;
|
|
}
|
|
}
|
|
|
|
free(buf);
|
|
return 1;
|
|
}
|
|
|
|
// Walks over all RIFF chunks that follow the 12 byte file header and records location and size
// of the chunks we care about (VP8X, VP8L, "VP8 ", ALPH) in table.
//
// Returns 1 if the chunks form a file we can decode, 0 otherwise (truncated or duplicated
// chunks, animation, invalid combination of chunks).
static int twp__read_chunks(unsigned char *u8data, int data_len, twp__chunk_table *table)
{
    // the spec says chunks should be in a specific order, but we don't really care,
    // so we don't bother checking

    memset(table, 0, sizeof(*table));

    // this needs 2 variables because first can be NULL
    int first_iter = 1;
    twp__chunk *first = NULL;

    // all bounds checks are done on offsets instead of pointers, so that we never form a pointer
    // beyond the end of the buffer
    size_t len = data_len > 0 ? (size_t)data_len : 0;
    size_t pos = 12;
    while (pos <= len && len - pos >= 8) {
        unsigned char *chunk = u8data + pos;
        uint32_t size = (uint32_t)chunk[4] | ((uint32_t)chunk[5] << 8) | ((uint32_t)chunk[6] << 16) | ((uint32_t)chunk[7] << 24);
        if (size > INT32_MAX) return 0;

        size_t data_pos = pos + 8;
        if (size > len - data_pos) return 0; // chunk is truncated
        unsigned char *chunk_data = u8data + data_pos;

        twp__chunk *c = NULL;
        if      (twp__check_fourcc(chunk, "ANIM")) return 0; // not supported
        else if (twp__check_fourcc(chunk, "VP8X")) c = &table->VP8X;
        else if (twp__check_fourcc(chunk, "VP8L")) c = &table->VP8L;
        else if (twp__check_fourcc(chunk, "VP8 ")) c = &table->VP8;
        else if (twp__check_fourcc(chunk, "ALPH")) c = &table->ALPH;

        if (c) {
            if (c->data) return 0; // found same chunk twice
            c->data = chunk_data;
            c->size = (int)size;
        }

        if (first_iter) {
            first = c;
            first_iter = 0;
        }

        pos = data_pos + size;
        if (size & 1) ++pos; // if chunk size is odd, a single padding byte is added
    }

    // validate

    // while we generally do not care about chunk order, twp_get_info() requires that the first chunk
    // is correct, since it only reads that one. so to be consistent, we enforce this here as well
    if (first != &table->VP8X && first != &table->VP8L && first != &table->VP8)
        return 0;

    if (table->ALPH.data && !table->VP8X.data) return 0; // alpha chunk is only allowed in the extended format
    if (!table->VP8L.data && !table->VP8.data) return 0; // must have either lossless or lossy chunk
    if (table->VP8L.data && table->ALPH.data)  return 0; // can't have an alpha chunk with a lossless chunk
    if (table->VP8L.data && table->VP8.data)   return 0; // can't have both lossless and lossy chunks

    return 1;
}
|
|
|
|
// Decodes a .webp file that is already in memory.
//
// data/data_len: the complete file contents.
// width/height:  receive the image size.
// format:        requested output format.
// flags:         only passed on to the lossy decoder.
//
// Returns a buffer the caller has to free(), or NULL if the file is invalid, not supported or
// could not be decoded.
twp__STORAGE unsigned char *twp_read_from_memory(void *data, int data_len, int *width, int *height, twp_format format, twp_flags flags)
{
    unsigned char *u8data = (unsigned char *)data;

    if (!u8data) return NULL;
    if (data_len < twp__MIN_FILE_SIZE) return NULL;
    if (!twp__check_fourcc(u8data, "RIFF")) return NULL;
    if (!twp__check_fourcc(u8data + 8, "WEBP")) return NULL;

    uint32_t hdr_file_size = (uint32_t)u8data[4] | ((uint32_t)u8data[5] << 8) | ((uint32_t)u8data[6] << 16) | ((uint32_t)u8data[7] << 24);
    if ((uint64_t)hdr_file_size+8 > INT32_MAX) return NULL; // not supported, we use ints everywhere
    if ((int)hdr_file_size+8 > data_len) return NULL; // the spec says we "may" parse files that have useless trailing bytes, so we do > instead of !=

    twp__chunk_table chunks;
    if (!twp__read_chunks(u8data, data_len, &chunks))
        return NULL;

    unsigned char *result = NULL;
    if (chunks.VP8L.data)
        result = twp__read_vp8l(chunks.VP8L.data, chunks.VP8L.size, width, height, format, 0);
    else if (chunks.VP8.data)
        result = twp__read_vp8(chunks.VP8.data, chunks.VP8.size, width, height, format, flags);
    else
        twp__assert(0);

    // if decoding failed there is no image to put an alpha channel into, and width and height
    // may not have been written. everything below must not run in that case
    if (!result)
        return NULL;

    if (chunks.ALPH.data && (format == twp_FORMAT_RGBA || format == twp_FORMAT_YUVA)) {
        if (!twp__read_alpha(chunks.ALPH.data, chunks.ALPH.size, *width, *height, result, format == twp_FORMAT_YUVA)) {
            free(result);
            return NULL;
        }
    }

    if (format == twp_FORMAT_YUVA && chunks.VP8.data && !chunks.ALPH.data) {
        // fill with 255 in case we want yuva but didn't have an alpha channel
        unsigned char *a;
        twp_unpack_yuv(result, *width, *height, NULL, NULL, NULL, &a, NULL, NULL, NULL);
        memset(a, 255, (size_t)*width * (size_t)*height);
    }

    return result;
}
|
|
|
|
// Computes where the individual planes are located inside a buffer that was returned for
// twp_FORMAT_YUV/twp_FORMAT_YUVA. Every output pointer may be NULL if it is not needed.
//
// Layout of the buffer, as computed here:
// - width and height are rounded up to a multiple of 16 for luma, chroma is half of that.
// - a luma row is 1 + luma width + 4 bytes, a chroma row is 1 + chroma width bytes. each plane
//   has one more row than its height. y, u and v point past the first row and first column.
// - the order is y, u, v, alpha. alpha rows are exactly width bytes, without extra rows or columns.
twp__STORAGE void twp_unpack_yuv(unsigned char *ptr, int width, int height,
                                 unsigned char **y, unsigned char **u, unsigned char **v, unsigned char **a,
                                 int *luma_stride, int *chroma_stride, int *alpha_stride)
{
    const int padded_w = twp__div_round_up(width, 16) * 16;
    const int padded_h = twp__div_round_up(height, 16) * 16;

    // bytes per row
    const int y_pitch = 1 + padded_w + 4;
    const int c_pitch = 1 + padded_w / 2;

    // bytes per plane, including the extra row
    const int y_bytes = y_pitch * (padded_h + 1);
    const int c_bytes = c_pitch * (padded_h / 2 + 1);

    if (luma_stride)   *luma_stride = y_pitch;
    if (chroma_stride) *chroma_stride = c_pitch;
    if (alpha_stride)  *alpha_stride = width;

    if (y) *y = ptr + y_pitch + 1;
    if (u) *u = ptr + y_bytes + c_pitch + 1;
    if (v) *v = ptr + y_bytes + c_bytes + c_pitch + 1;
    if (a) *a = ptr + y_bytes + c_bytes*2;
}
|
|
|
|
// On Windows we declare the one function we need ourselves, so that we don't have to
// include windows.h
#ifdef _WIN32
#ifdef __cplusplus
extern "C"
#endif
__declspec(dllimport) int __stdcall MultiByteToWideChar(unsigned int cp, unsigned long flags, const char *str, int cbmb, wchar_t *widestr, int cchwide);
#endif

// Opens a file for reading in binary mode. On Windows file_path is converted from UTF-8 to a
// wide string first. Returns NULL if the conversion fails or the file can't be opened.
static FILE *twp__fopen(const char *file_path)
{
#if !defined(_WIN32)
    return fopen(file_path, "rb");
#else
    wchar_t wide_path[4096];
    const int wide_capacity = sizeof(wide_path) / sizeof(*wide_path);

    int converted = MultiByteToWideChar(65001 /* UTF8 */, 0, file_path, -1, wide_path, wide_capacity);
    if (converted == 0)
        return NULL;

#if defined(_MSC_VER) && _MSC_VER >= 1400
    FILE *handle = NULL;
    return _wfopen_s(&handle, wide_path, L"rb") == 0 ? handle : NULL;
#else
    return _wfopen(wide_path, L"rb");
#endif
#endif
}
|
|
|
|
// Reads a whole file into memory.
//
// file_path: path of the file to read.
// size:      receives the number of bytes read. Only written on success.
//
// Returns a malloc()ed buffer the caller has to free(), or NULL if the file can't be opened,
// its size can't be determined or doesn't fit into an int, memory runs out or reading fails.
static unsigned char *twp__read_entire_file(const char *file_path, int *size)
{
    // everything is declared up here so that no goto jumps over an initialization
    unsigned char *mem = NULL;
    long len = 0;

    FILE *f = twp__fopen(file_path);
    if (!f) return NULL;

    if (fseek(f, 0, SEEK_END) != 0) goto end;
    len = ftell(f); // -1 on error
    if (len < 0 || len > INT32_MAX) goto end; // we use ints everywhere
    if (fseek(f, 0, SEEK_SET) != 0) goto end;

    // never ask for 0 bytes, malloc(0) may return NULL even though nothing went wrong
    mem = (unsigned char *)malloc(len > 0 ? (size_t)len : 1);
    if (!mem) goto end;

    if (fread(mem, 1, (size_t)len, f) != (size_t)len) {
        // short read, don't hand out a partially filled buffer
        free(mem);
        mem = NULL;
        goto end;
    }

    *size = (int)len;

end:;
    fclose(f);
    return mem;
}
|
|
|
|
// Extracts basic information from the beginning of a .webp file that is already in memory.
// Only the first chunk (the one that starts at byte 12) is looked at.
//
// data/data_len: the file contents. 25 bytes are enough if the first chunk is VP8L, 30 bytes
//                for "VP8 " and VP8X.
// width, height, lossless, alpha: receive the result. Each of them may be NULL. They are only
//                written if the function succeeds.
//                Note that lossless is reported as 0 whenever the first chunk is VP8X.
//
// Returns 1 on success, 0 if there is not enough data, the first chunk is not one of the above,
// or the file is an animation (or has a size only animations can have).
twp__STORAGE int twp_get_info_from_memory(void *data, int data_len, int *width, int *height, int *lossless, int *alpha)
{
    if (!data || data_len < 16)
        return 0;

    unsigned char *u8data = (unsigned char *)data;

    // collect everything in locals first. this way validating the values doesn't depend on the
    // (optional!) output pointers, and we don't write anything if we fail
    int w = 0;
    int h = 0;
    int is_lossless = 0;
    int has_alpha = 0;

    if (twp__check_fourcc(u8data + 12, "VP8L")) {
        if (data_len < 25)
            return 0;

        is_lossless = 1;
        w = ((int)u8data[21] | (((int)u8data[22] & 0x3f) << 8)) + 1;
        h = (((int)u8data[22] >> 6) | ((int)u8data[23] << 2) | (((int)u8data[24] & 0xf) << 10)) + 1;
        has_alpha = !!(u8data[24] & 0x10);
    } else if (twp__check_fourcc(u8data + 12, "VP8 ")) {
        if (data_len < 30)
            return 0;

        is_lossless = 0;
        w = (int)u8data[26] | (((int)u8data[27] & 0x3f) << 8);
        h = (int)u8data[28] | (((int)u8data[29] & 0x3f) << 8);
        has_alpha = 0;
    } else if (twp__check_fourcc(u8data + 12, "VP8X")) {
        if (data_len < 30)
            return 0;

        if (u8data[20] & 0x2) // we don't support animation
            return 0;

        is_lossless = 0;
        w = ((int)u8data[24] | ((int)u8data[25] << 8) | ((int)u8data[26] << 16)) + 1;
        h = ((int)u8data[27] | ((int)u8data[28] << 8) | ((int)u8data[29] << 16)) + 1;
        has_alpha = !!(u8data[20] & 0x10);

        if (w > 16383 || h > 16383) // bigger sizes only possible for animations
            return 0;
    } else {
        return 0;
    }

    if (width)    *width = w;
    if (height)   *height = h;
    if (lossless) *lossless = is_lossless;
    if (alpha)    *alpha = has_alpha;

    return 1;
}
|
|
|
|
// Extracts basic information from a .webp file on disk. Only reads as many bytes from the
// start of the file as twp_get_info_from_memory() needs for the type of the first chunk, then
// lets that function do the actual work.
//
// Returns 1 on success, 0 if the file can't be opened, is too short, or is rejected by
// twp_get_info_from_memory().
twp__STORAGE int twp_get_info(const char *file_path, int *width, int *height, int *lossless, int *alpha)
{
    FILE *f = twp__fopen(file_path);
    if (!f)
        return 0;

    unsigned char buf[30];
    int buflen = 0;
    int ok = 0;

    // first 16 bytes: riff header and the fourcc of the first chunk
    if (fread(buf, 1, 16, f) == 16) {
        // how many more bytes do we need for this kind of chunk? 0 = unknown chunk
        size_t extra = 0;
        if (twp__check_fourcc(buf + 12, "VP8L"))
            extra = 9;
        else if (twp__check_fourcc(buf + 12, "VP8 ") || twp__check_fourcc(buf + 12, "VP8X"))
            extra = 14;

        if (extra != 0 && fread(buf+16, 1, extra, f) == extra) {
            buflen = 16 + (int)extra;
            twp__assert(twp__arrlen(buf) >= buflen);

            ok = twp_get_info_from_memory(buf, buflen, width, height, lossless, alpha);
        }
    }

    fclose(f);
    return ok;
}
|
|
|
|
// Loads a .webp file from disk: reads the whole file into memory, decodes it with
// twp_read_from_memory() and releases the file contents again.
//
// Returns the decoded image (the caller has to free() it), or NULL if the file could not be
// read or decoded.
twp__STORAGE unsigned char *twp_read(const char *file_path, int *width, int *height, twp_format format, twp_flags flags)
{
    int file_size = 0;
    unsigned char *file_bytes = twp__read_entire_file(file_path, &file_size);
    unsigned char *pixels = NULL;

    if (file_bytes) {
        pixels = twp_read_from_memory(file_bytes, file_size, width, height, format, flags);
        free(file_bytes);
    }

    return pixels;
}
|
|
|
|
#endif
|