4
0
mirror of https://github.com/cwinfo/matterbridge.git synced 2025-07-03 08:27:44 +00:00

Convert .tgs with go libraries (and cgo) (telegram) (#1569)

This commit adds support for go/cgo tgs conversion when building with the -tags `cgo`
The default binaries are still "pure" go and uses the old way of converting.

* Move lottie_convert.py conversion code to its own file

* Add optional libtgsconverter

* Update vendor

* Apply suggestions from code review

* Update bridge/helper/libtgsconverter.go

Co-authored-by: Wim <wim@42.be>
This commit is contained in:
Benau
2021-08-25 04:32:50 +08:00
committed by GitHub
parent d4195deb3a
commit 53cafa9f3d
310 changed files with 121526 additions and 85 deletions

1
vendor/github.com/sizeofint/webpanimation/.gitignore generated vendored Normal file
View File

@ -0,0 +1 @@
.idea

27
vendor/github.com/sizeofint/webpanimation/LICENSE generated vendored Normal file
View File

@ -0,0 +1,27 @@
Copyright (c) 2021, george.imerlishvili@gmail.com. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of {organization}. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

362
vendor/github.com/sizeofint/webpanimation/capi.go generated vendored Normal file
View File

@ -0,0 +1,362 @@
package webpanimation
/*
#cgo CFLAGS: -Wno-pointer-sign -DHAVE_CONFIG_H
#cgo !windows LDFLAGS: -lm
#include "webp_encode.h"
#include "webp_mux.h"
*/
import "C"
import (
"errors"
"unsafe"
)
type WebPMuxError int
const (
WebpMuxAbiVersion = 0x0108
WebpEncoderAbiVersion = 0x020f
)
const (
WebpMuxOk = WebPMuxError(C.WEBP_MUX_OK)
WebpMuxNotFound = WebPMuxError(C.WEBP_MUX_NOT_FOUND)
WebpMuxInvalidArgument = WebPMuxError(C.WEBP_MUX_INVALID_ARGUMENT)
WebpMuxBadData = WebPMuxError(C.WEBP_MUX_BAD_DATA)
WebpMuxMemoryError = WebPMuxError(C.WEBP_MUX_MEMORY_ERROR)
WebpMuxNotEnoughData = WebPMuxError(C.WEBP_MUX_NOT_ENOUGH_DATA)
)
type WebPPicture C.WebPPicture
type WebPAnimEncoder C.WebPAnimEncoder
type WebPAnimEncoderOptions C.WebPAnimEncoderOptions
type WebPData C.WebPData
type WebPMux C.WebPMux
type WebPMuxAnimParams C.WebPMuxAnimParams
type webPConfig struct {
webpConfig *C.WebPConfig
}
type WebPConfig interface {
getRawPointer() *C.WebPConfig
SetLossless(v int)
GetLossless() int
SetMethod(v int)
SetImageHint(v int)
SetTargetSize(v int)
SetTargetPSNR(v float32)
SetSegments(v int)
SetSnsStrength(v int)
SetFilterStrength(v int)
SetFilterSharpness(v int)
SetAutofilter(v int)
SetAlphaCompression(v int)
SetAlphaFiltering(v int)
SetPass(v int)
SetShowCompressed(v int)
SetPreprocessing(v int)
SetPartitions(v int)
SetPartitionLimit(v int)
SetEmulateJpegSize(v int)
SetThreadLevel(v int)
SetLowMemory(v int)
SetNearLossless(v int)
SetExact(v int)
SetUseDeltaPalette(v int)
SetUseSharpYuv(v int)
SetAlphaQuality(v int)
SetFilterType(v int)
SetQuality(v float32)
}
func WebPDataClear(webPData *WebPData) {
C.WebPDataClear((*C.WebPData)(unsafe.Pointer(webPData)))
}
func WebPMuxDelete(webPMux *WebPMux) {
C.WebPMuxDelete((*C.WebPMux)(unsafe.Pointer(webPMux)))
}
func WebPPictureFree(webPPicture *WebPPicture) {
C.WebPPictureFree((*C.WebPPicture)(unsafe.Pointer(webPPicture)))
}
func WebPAnimEncoderDelete(webPAnimEncoder *WebPAnimEncoder) {
C.WebPAnimEncoderDelete((*C.WebPAnimEncoder)(unsafe.Pointer(webPAnimEncoder)))
}
func (wmap *WebPMuxAnimParams) SetBgcolor(v uint32) {
((*C.WebPMuxAnimParams)(wmap)).bgcolor = (C.uint32_t)(v)
}
func (wmap *WebPMuxAnimParams) SetLoopCount(v int) {
(*C.WebPMuxAnimParams)(wmap).loop_count = (C.int)(v)
}
func WebPPictureInit(webPPicture *WebPPicture) int {
return int(C.WebPPictureInit((*C.WebPPicture)(unsafe.Pointer(webPPicture))))
}
func (wpp *WebPPicture) SetWidth(v int) {
((*C.WebPPicture)(wpp)).width = (C.int)(v)
}
func (wpp *WebPPicture) SetHeight(v int) {
((*C.WebPPicture)(wpp)).height = (C.int)(v)
}
func (wpp WebPPicture) GetWidth() int {
return int(((C.WebPPicture)(wpp)).width)
}
func (wpp WebPPicture) GetHeight() int {
return int(((C.WebPPicture)(wpp)).height)
}
func (wpp *WebPPicture) SetUseArgb(v int) {
((*C.WebPPicture)(wpp)).use_argb = (C.int)(v)
}
func (wpd WebPData) GetBytes() []byte {
return C.GoBytes(unsafe.Pointer(((C.WebPData)(wpd)).bytes), (C.int)(((C.WebPData)(wpd)).size))
}
func WebPDataInit(webPData *WebPData) {
C.WebPDataInit((*C.WebPData)(unsafe.Pointer(webPData)))
}
// NewWebpConfig create webpconfig instance
func NewWebpConfig() WebPConfig {
webpcfg := &webPConfig{}
webpcfg.webpConfig = &C.WebPConfig{}
WebPConfigInitInternal(webpcfg)
return webpcfg
}
func WebPConfigInitInternal(config WebPConfig) int {
return int(C.WebPConfigInitInternal(
config.getRawPointer(),
(C.WebPPreset)(0),
(C.float)(75.0),
(C.int)(WebpEncoderAbiVersion),
))
}
func (webpCfg *webPConfig) getRawPointer() *C.WebPConfig {
return webpCfg.webpConfig
}
func (webpCfg *webPConfig) SetLossless(v int) {
webpCfg.webpConfig.lossless = (C.int)(v)
}
func (webpCfg *webPConfig) GetLossless() int {
return int(webpCfg.webpConfig.lossless)
}
func (webpCfg *webPConfig) SetMethod(v int) {
webpCfg.webpConfig.method = (C.int)(v)
}
func (webpCfg *webPConfig) SetImageHint(v int) {
webpCfg.webpConfig.image_hint = (C.WebPImageHint)(v)
}
func (webpCfg *webPConfig) SetTargetSize(v int) {
webpCfg.webpConfig.target_size = (C.int)(v)
}
func (webpCfg *webPConfig) SetTargetPSNR(v float32) {
webpCfg.webpConfig.target_PSNR = (C.float)(v)
}
func (webpCfg *webPConfig) SetSegments(v int) {
webpCfg.webpConfig.segments = (C.int)(v)
}
func (webpCfg *webPConfig) SetSnsStrength(v int) {
webpCfg.webpConfig.sns_strength = (C.int)(v)
}
func (webpCfg *webPConfig) SetFilterStrength(v int) {
webpCfg.webpConfig.filter_strength = (C.int)(v)
}
func (webpCfg *webPConfig) SetFilterSharpness(v int) {
webpCfg.webpConfig.filter_sharpness = (C.int)(v)
}
func (webpCfg *webPConfig) SetAutofilter(v int) {
webpCfg.webpConfig.autofilter = (C.int)(v)
}
func (webpCfg *webPConfig) SetAlphaCompression(v int) {
webpCfg.webpConfig.alpha_compression = (C.int)(v)
}
func (webpCfg *webPConfig) SetAlphaFiltering(v int) {
webpCfg.webpConfig.alpha_filtering = (C.int)(v)
}
func (webpCfg *webPConfig) SetPass(v int) {
webpCfg.webpConfig.pass = (C.int)(v)
}
func (webpCfg *webPConfig) SetShowCompressed(v int) {
webpCfg.webpConfig.show_compressed = (C.int)(v)
}
func (webpCfg *webPConfig) SetPreprocessing(v int) {
webpCfg.webpConfig.preprocessing = (C.int)(v)
}
func (webpCfg *webPConfig) SetPartitions(v int) {
webpCfg.webpConfig.partitions = (C.int)(v)
}
func (webpCfg *webPConfig) SetPartitionLimit(v int) {
webpCfg.webpConfig.partition_limit = (C.int)(v)
}
func (webpCfg *webPConfig) SetEmulateJpegSize(v int) {
webpCfg.webpConfig.emulate_jpeg_size = (C.int)(v)
}
func (webpCfg *webPConfig) SetThreadLevel(v int) {
webpCfg.webpConfig.thread_level = (C.int)(v)
}
func (webpCfg *webPConfig) SetLowMemory(v int) {
webpCfg.webpConfig.low_memory = (C.int)(v)
}
func (webpCfg *webPConfig) SetNearLossless(v int) {
webpCfg.webpConfig.near_lossless = (C.int)(v)
}
func (webpCfg *webPConfig) SetExact(v int) {
webpCfg.webpConfig.exact = (C.int)(v)
}
func (webpCfg *webPConfig) SetUseDeltaPalette(v int) {
webpCfg.webpConfig.use_delta_palette = (C.int)(v)
}
func (webpCfg *webPConfig) SetUseSharpYuv(v int) {
webpCfg.webpConfig.use_sharp_yuv = (C.int)(v)
}
func (webpCfg *webPConfig) SetAlphaQuality(v int) {
webpCfg.webpConfig.alpha_quality = (C.int)(v)
}
func (webpCfg *webPConfig) SetFilterType(v int) {
webpCfg.webpConfig.filter_type = (C.int)(v)
}
func (webpCfg *webPConfig) SetQuality(v float32) {
webpCfg.webpConfig.quality = (C.float)(v)
}
func (encOptions *WebPAnimEncoderOptions) GetAnimParams() WebPMuxAnimParams {
return WebPMuxAnimParams(((*C.WebPAnimEncoderOptions)(encOptions)).anim_params)
}
func (encOptions *WebPAnimEncoderOptions) SetAnimParams(v WebPMuxAnimParams) {
((*C.WebPAnimEncoderOptions)(encOptions)).anim_params = (C.WebPMuxAnimParams)(v)
}
func (encOptions *WebPAnimEncoderOptions) SetMinimizeSize(v int) {
((*C.WebPAnimEncoderOptions)(encOptions)).minimize_size = (C.int)(v)
}
func (encOptions *WebPAnimEncoderOptions) SetKmin(v int) {
((*C.WebPAnimEncoderOptions)(encOptions)).kmin = (C.int)(v)
}
func (encOptions *WebPAnimEncoderOptions) SetKmax(v int) {
((*C.WebPAnimEncoderOptions)(encOptions)).kmax = (C.int)(v)
}
func (encOptions *WebPAnimEncoderOptions) SetAllowMixed(v int) {
((*C.WebPAnimEncoderOptions)(encOptions)).allow_mixed = (C.int)(v)
}
func (encOptions *WebPAnimEncoderOptions) SetVerbose(v int) {
((*C.WebPAnimEncoderOptions)(encOptions)).verbose = (C.int)(v)
}
func WebPAnimEncoderOptionsInitInternal(webPAnimEncoderOptions *WebPAnimEncoderOptions) int {
return int(C.WebPAnimEncoderOptionsInitInternal(
(*C.WebPAnimEncoderOptions)(unsafe.Pointer(webPAnimEncoderOptions)),
(C.int)(WebpMuxAbiVersion),
))
}
func WebPPictureImportRGBA(data []byte, stride int, webPPicture *WebPPicture) error {
res := int(C.WebPPictureImportRGBA(
(*C.WebPPicture)(unsafe.Pointer(webPPicture)),
(*C.uint8_t)(unsafe.Pointer(&data[0])),
(C.int)(stride),
))
if res == 0 {
return errors.New("error: WebPPictureImportBGRA")
}
return nil
}
func WebPAnimEncoderNewInternal(width, height int, webPAnimEncoderOptions *WebPAnimEncoderOptions) *WebPAnimEncoder {
return (*WebPAnimEncoder)(C.WebPAnimEncoderNewInternal(
(C.int)(width),
(C.int)(height),
(*C.WebPAnimEncoderOptions)(unsafe.Pointer(webPAnimEncoderOptions)),
(C.int)(WebpMuxAbiVersion),
))
}
func WebPAnimEncoderAdd(webPAnimEncoder *WebPAnimEncoder, webPPicture *WebPPicture, timestamp int, webpcfg WebPConfig) int {
return int(C.WebPAnimEncoderAdd(
(*C.WebPAnimEncoder)(unsafe.Pointer(webPAnimEncoder)),
(*C.WebPPicture)(unsafe.Pointer(webPPicture)),
(C.int)(timestamp),
webpcfg.getRawPointer(),
))
}
func WebPAnimEncoderAssemble(webPAnimEncoder *WebPAnimEncoder, webPData *WebPData) int {
return int(C.WebPAnimEncoderAssemble(
(*C.WebPAnimEncoder)(unsafe.Pointer(webPAnimEncoder)),
(*C.WebPData)(unsafe.Pointer(webPData)),
))
}
func WebPMuxCreateInternal(webPData *WebPData, copyData int) *WebPMux {
return (*WebPMux)(C.WebPMuxCreateInternal(
(*C.WebPData)(unsafe.Pointer(webPData)),
(C.int)(copyData),
(C.int)(WebpMuxAbiVersion),
))
}
func WebPMuxSetAnimationParams(webPMux *WebPMux, webPMuxAnimParams *WebPMuxAnimParams) WebPMuxError {
return (WebPMuxError)(C.WebPMuxSetAnimationParams(
(*C.WebPMux)(unsafe.Pointer(webPMux)),
(*C.WebPMuxAnimParams)(unsafe.Pointer(webPMuxAnimParams)),
))
}
func WebPMuxGetAnimationParams(webPMux *WebPMux, webPMuxAnimParams *WebPMuxAnimParams) WebPMuxError {
return (WebPMuxError)(C.WebPMuxGetAnimationParams(
(*C.WebPMux)(unsafe.Pointer(webPMux)),
(*C.WebPMuxAnimParams)(unsafe.Pointer(webPMuxAnimParams)),
))
}
func WebPMuxAssemble(webPMux *WebPMux, webPData *WebPData) WebPMuxError {
return (WebPMuxError)(C.WebPMuxAssemble(
(*C.WebPMux)(unsafe.Pointer(webPMux)),
(*C.WebPData)(unsafe.Pointer(webPData)),
))
}

4
vendor/github.com/sizeofint/webpanimation/config.h generated vendored Normal file
View File

@ -0,0 +1,4 @@
#ifndef WEBANIMATION_HPP
#define WEBANIMATION_HPP
#define WEBP_USE_THREAD
#endif

View File

@ -0,0 +1,232 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Alpha-plane decompression.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "dec_alphai_dec.h"
#include "dec_vp8i_dec.h"
#include "dec_vp8li_dec.h"
#include "dsp_dsp.h"
#include "utils_quant_levels_dec_utils.h"
#include "utils_utils.h"
#include "webp_format_constants.h"
//------------------------------------------------------------------------------
// ALPHDecoder object.
// Allocates a new alpha decoder instance.
static ALPHDecoder* ALPHNew(void) {
ALPHDecoder* const dec = (ALPHDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
return dec;
}
// Clears and deallocates an alpha decoder instance.
static void ALPHDelete(ALPHDecoder* const dec) {
if (dec != NULL) {
VP8LDelete(dec->vp8l_dec_);
dec->vp8l_dec_ = NULL;
WebPSafeFree(dec);
}
}
//------------------------------------------------------------------------------
// Decoding.
// Initialize alpha decoding by parsing the alpha header and decoding the image
// header for alpha data stored using lossless compression.
// Returns false in case of error in alpha header (data too short, invalid
// compression method or filter, error in lossless header data etc).
static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
size_t data_size, const VP8Io* const src_io,
uint8_t* output) {
int ok = 0;
const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
int rsrv;
VP8Io* const io = &dec->io_;
assert(data != NULL && output != NULL && src_io != NULL);
VP8FiltersInit();
dec->output_ = output;
dec->width_ = src_io->width;
dec->height_ = src_io->height;
assert(dec->width_ > 0 && dec->height_ > 0);
if (data_size <= ALPHA_HEADER_LEN) {
return 0;
}
dec->method_ = (data[0] >> 0) & 0x03;
dec->filter_ = (WEBP_FILTER_TYPE)((data[0] >> 2) & 0x03);
dec->pre_processing_ = (data[0] >> 4) & 0x03;
rsrv = (data[0] >> 6) & 0x03;
if (dec->method_ < ALPHA_NO_COMPRESSION ||
dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
dec->filter_ >= WEBP_FILTER_LAST ||
dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
rsrv != 0) {
return 0;
}
// Copy the necessary parameters from src_io to io
VP8InitIo(io);
WebPInitCustomIo(NULL, io);
io->opaque = dec;
io->width = src_io->width;
io->height = src_io->height;
io->use_cropping = src_io->use_cropping;
io->crop_left = src_io->crop_left;
io->crop_right = src_io->crop_right;
io->crop_top = src_io->crop_top;
io->crop_bottom = src_io->crop_bottom;
// No need to copy the scaling parameters.
if (dec->method_ == ALPHA_NO_COMPRESSION) {
const size_t alpha_decoded_size = dec->width_ * dec->height_;
ok = (alpha_data_size >= alpha_decoded_size);
} else {
assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size);
}
return ok;
}
// Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha
// starting from row number 'row'. It assumes that rows up to (row - 1) have
// already been decoded.
// Returns false in case of bitstream error.
static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
ALPHDecoder* const alph_dec = dec->alph_dec_;
const int width = alph_dec->width_;
const int height = alph_dec->io_.crop_bottom;
if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
int y;
const uint8_t* prev_line = dec->alpha_prev_line_;
const uint8_t* deltas = dec->alpha_data_ + ALPHA_HEADER_LEN + row * width;
uint8_t* dst = dec->alpha_plane_ + row * width;
assert(deltas <= &dec->alpha_data_[dec->alpha_data_size_]);
if (alph_dec->filter_ != WEBP_FILTER_NONE) {
assert(WebPUnfilters[alph_dec->filter_] != NULL);
for (y = 0; y < num_rows; ++y) {
WebPUnfilters[alph_dec->filter_](prev_line, deltas, dst, width);
prev_line = dst;
dst += width;
deltas += width;
}
} else {
for (y = 0; y < num_rows; ++y) {
memcpy(dst, deltas, width * sizeof(*dst));
prev_line = dst;
dst += width;
deltas += width;
}
}
dec->alpha_prev_line_ = prev_line;
} else { // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
assert(alph_dec->vp8l_dec_ != NULL);
if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
return 0;
}
}
if (row + num_rows >= height) {
dec->is_alpha_decoded_ = 1;
}
return 1;
}
static int AllocateAlphaPlane(VP8Decoder* const dec, const VP8Io* const io) {
const int stride = io->width;
const int height = io->crop_bottom;
const uint64_t alpha_size = (uint64_t)stride * height;
assert(dec->alpha_plane_mem_ == NULL);
dec->alpha_plane_mem_ =
(uint8_t*)WebPSafeMalloc(alpha_size, sizeof(*dec->alpha_plane_));
if (dec->alpha_plane_mem_ == NULL) {
return 0;
}
dec->alpha_plane_ = dec->alpha_plane_mem_;
dec->alpha_prev_line_ = NULL;
return 1;
}
void WebPDeallocateAlphaMemory(VP8Decoder* const dec) {
assert(dec != NULL);
WebPSafeFree(dec->alpha_plane_mem_);
dec->alpha_plane_mem_ = NULL;
dec->alpha_plane_ = NULL;
ALPHDelete(dec->alph_dec_);
dec->alph_dec_ = NULL;
}
//------------------------------------------------------------------------------
// Main entry point.
const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
const VP8Io* const io,
int row, int num_rows) {
const int width = io->width;
const int height = io->crop_bottom;
assert(dec != NULL && io != NULL);
if (row < 0 || num_rows <= 0 || row + num_rows > height) {
return NULL; // sanity check.
}
if (!dec->is_alpha_decoded_) {
if (dec->alph_dec_ == NULL) { // Initialize decoder.
dec->alph_dec_ = ALPHNew();
if (dec->alph_dec_ == NULL) return NULL;
if (!AllocateAlphaPlane(dec, io)) goto Error;
if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
io, dec->alpha_plane_)) {
goto Error;
}
// if we allowed use of alpha dithering, check whether it's needed at all
if (dec->alph_dec_->pre_processing_ != ALPHA_PREPROCESSED_LEVELS) {
dec->alpha_dithering_ = 0; // disable dithering
} else {
num_rows = height - row; // decode everything in one pass
}
}
assert(dec->alph_dec_ != NULL);
assert(row + num_rows <= height);
if (!ALPHDecode(dec, row, num_rows)) goto Error;
if (dec->is_alpha_decoded_) { // finished?
ALPHDelete(dec->alph_dec_);
dec->alph_dec_ = NULL;
if (dec->alpha_dithering_ > 0) {
uint8_t* const alpha = dec->alpha_plane_ + io->crop_top * width
+ io->crop_left;
if (!WebPDequantizeLevels(alpha,
io->crop_right - io->crop_left,
io->crop_bottom - io->crop_top,
width, dec->alpha_dithering_)) {
goto Error;
}
}
}
}
// Return a pointer to the current decoded row.
return dec->alpha_plane_ + row * width;
Error:
WebPDeallocateAlphaMemory(dec);
return NULL;
}

View File

@ -0,0 +1,54 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Alpha decoder: internal header.
//
// Author: Urvang (urvang@google.com)
#ifndef WEBP_DEC_ALPHAI_DEC_H_
#define WEBP_DEC_ALPHAI_DEC_H_
#include "dec_webpi_dec.h"
#include "utils_filters_utils.h"
#ifdef __cplusplus
extern "C" {
#endif
struct VP8LDecoder; // Defined in dec/vp8li.h.
typedef struct ALPHDecoder ALPHDecoder;
struct ALPHDecoder {
int width_;
int height_;
int method_;
WEBP_FILTER_TYPE filter_;
int pre_processing_;
struct VP8LDecoder* vp8l_dec_;
VP8Io io_;
int use_8b_decode_; // Although alpha channel requires only 1 byte per
// pixel, sometimes VP8LDecoder may need to allocate
// 4 bytes per pixel internally during decode.
uint8_t* output_;
const uint8_t* prev_line_; // last output row (or NULL)
};
//------------------------------------------------------------------------------
// internal functions. Not public.
// Deallocate memory associated to dec->alpha_plane_ decoding
void WebPDeallocateAlphaMemory(VP8Decoder* const dec);
//------------------------------------------------------------------------------
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DEC_ALPHAI_DEC_H_

View File

@ -0,0 +1,312 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Everything about WebPDecBuffer
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "dec_vp8i_dec.h"
#include "dec_webpi_dec.h"
#include "utils_utils.h"
//------------------------------------------------------------------------------
// WebPDecBuffer
// Number of bytes per pixel for the different color-spaces.
static const uint8_t kModeBpp[MODE_LAST] = {
3, 4, 3, 4, 4, 2, 2,
4, 4, 4, 2, // pre-multiplied modes
1, 1 };
// Check that webp_csp_mode is within the bounds of WEBP_CSP_MODE.
// Convert to an integer to handle both the unsigned/signed enum cases
// without the need for casting to remove type limit warnings.
static int IsValidColorspace(int webp_csp_mode) {
return (webp_csp_mode >= MODE_RGB && webp_csp_mode < MODE_LAST);
}
// strictly speaking, the very last (or first, if flipped) row
// doesn't require padding.
#define MIN_BUFFER_SIZE(WIDTH, HEIGHT, STRIDE) \
((uint64_t)(STRIDE) * ((HEIGHT) - 1) + (WIDTH))
static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) {
int ok = 1;
const WEBP_CSP_MODE mode = buffer->colorspace;
const int width = buffer->width;
const int height = buffer->height;
if (!IsValidColorspace(mode)) {
ok = 0;
} else if (!WebPIsRGBMode(mode)) { // YUV checks
const WebPYUVABuffer* const buf = &buffer->u.YUVA;
const int uv_width = (width + 1) / 2;
const int uv_height = (height + 1) / 2;
const int y_stride = abs(buf->y_stride);
const int u_stride = abs(buf->u_stride);
const int v_stride = abs(buf->v_stride);
const int a_stride = abs(buf->a_stride);
const uint64_t y_size = MIN_BUFFER_SIZE(width, height, y_stride);
const uint64_t u_size = MIN_BUFFER_SIZE(uv_width, uv_height, u_stride);
const uint64_t v_size = MIN_BUFFER_SIZE(uv_width, uv_height, v_stride);
const uint64_t a_size = MIN_BUFFER_SIZE(width, height, a_stride);
ok &= (y_size <= buf->y_size);
ok &= (u_size <= buf->u_size);
ok &= (v_size <= buf->v_size);
ok &= (y_stride >= width);
ok &= (u_stride >= uv_width);
ok &= (v_stride >= uv_width);
ok &= (buf->y != NULL);
ok &= (buf->u != NULL);
ok &= (buf->v != NULL);
if (mode == MODE_YUVA) {
ok &= (a_stride >= width);
ok &= (a_size <= buf->a_size);
ok &= (buf->a != NULL);
}
} else { // RGB checks
const WebPRGBABuffer* const buf = &buffer->u.RGBA;
const int stride = abs(buf->stride);
const uint64_t size =
MIN_BUFFER_SIZE(width * kModeBpp[mode], height, stride);
ok &= (size <= buf->size);
ok &= (stride >= width * kModeBpp[mode]);
ok &= (buf->rgba != NULL);
}
return ok ? VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM;
}
#undef MIN_BUFFER_SIZE
static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) {
const int w = buffer->width;
const int h = buffer->height;
const WEBP_CSP_MODE mode = buffer->colorspace;
if (w <= 0 || h <= 0 || !IsValidColorspace(mode)) {
return VP8_STATUS_INVALID_PARAM;
}
if (buffer->is_external_memory <= 0 && buffer->private_memory == NULL) {
uint8_t* output;
int uv_stride = 0, a_stride = 0;
uint64_t uv_size = 0, a_size = 0, total_size;
// We need memory and it hasn't been allocated yet.
// => initialize output buffer, now that dimensions are known.
int stride;
uint64_t size;
if ((uint64_t)w * kModeBpp[mode] >= (1ull << 32)) {
return VP8_STATUS_INVALID_PARAM;
}
stride = w * kModeBpp[mode];
size = (uint64_t)stride * h;
if (!WebPIsRGBMode(mode)) {
uv_stride = (w + 1) / 2;
uv_size = (uint64_t)uv_stride * ((h + 1) / 2);
if (mode == MODE_YUVA) {
a_stride = w;
a_size = (uint64_t)a_stride * h;
}
}
total_size = size + 2 * uv_size + a_size;
// Security/sanity checks
output = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*output));
if (output == NULL) {
return VP8_STATUS_OUT_OF_MEMORY;
}
buffer->private_memory = output;
if (!WebPIsRGBMode(mode)) { // YUVA initialization
WebPYUVABuffer* const buf = &buffer->u.YUVA;
buf->y = output;
buf->y_stride = stride;
buf->y_size = (size_t)size;
buf->u = output + size;
buf->u_stride = uv_stride;
buf->u_size = (size_t)uv_size;
buf->v = output + size + uv_size;
buf->v_stride = uv_stride;
buf->v_size = (size_t)uv_size;
if (mode == MODE_YUVA) {
buf->a = output + size + 2 * uv_size;
}
buf->a_size = (size_t)a_size;
buf->a_stride = a_stride;
} else { // RGBA initialization
WebPRGBABuffer* const buf = &buffer->u.RGBA;
buf->rgba = output;
buf->stride = stride;
buf->size = (size_t)size;
}
}
return CheckDecBuffer(buffer);
}
VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer) {
if (buffer == NULL) {
return VP8_STATUS_INVALID_PARAM;
}
if (WebPIsRGBMode(buffer->colorspace)) {
WebPRGBABuffer* const buf = &buffer->u.RGBA;
buf->rgba += (buffer->height - 1) * buf->stride;
buf->stride = -buf->stride;
} else {
WebPYUVABuffer* const buf = &buffer->u.YUVA;
const int H = buffer->height;
buf->y += (H - 1) * buf->y_stride;
buf->y_stride = -buf->y_stride;
buf->u += ((H - 1) >> 1) * buf->u_stride;
buf->u_stride = -buf->u_stride;
buf->v += ((H - 1) >> 1) * buf->v_stride;
buf->v_stride = -buf->v_stride;
if (buf->a != NULL) {
buf->a += (H - 1) * buf->a_stride;
buf->a_stride = -buf->a_stride;
}
}
return VP8_STATUS_OK;
}
VP8StatusCode WebPAllocateDecBuffer(int width, int height,
const WebPDecoderOptions* const options,
WebPDecBuffer* const buffer) {
VP8StatusCode status;
if (buffer == NULL || width <= 0 || height <= 0) {
return VP8_STATUS_INVALID_PARAM;
}
if (options != NULL) { // First, apply options if there is any.
if (options->use_cropping) {
const int cw = options->crop_width;
const int ch = options->crop_height;
const int x = options->crop_left & ~1;
const int y = options->crop_top & ~1;
if (x < 0 || y < 0 || cw <= 0 || ch <= 0 ||
x + cw > width || y + ch > height) {
return VP8_STATUS_INVALID_PARAM; // out of frame boundary.
}
width = cw;
height = ch;
}
if (options->use_scaling) {
#if !defined(WEBP_REDUCE_SIZE)
int scaled_width = options->scaled_width;
int scaled_height = options->scaled_height;
if (!WebPRescalerGetScaledDimensions(
width, height, &scaled_width, &scaled_height)) {
return VP8_STATUS_INVALID_PARAM;
}
width = scaled_width;
height = scaled_height;
#else
return VP8_STATUS_INVALID_PARAM; // rescaling not supported
#endif
}
}
buffer->width = width;
buffer->height = height;
// Then, allocate buffer for real.
status = AllocateBuffer(buffer);
if (status != VP8_STATUS_OK) return status;
// Use the stride trick if vertical flip is needed.
if (options != NULL && options->flip) {
status = WebPFlipBuffer(buffer);
}
return status;
}
//------------------------------------------------------------------------------
// constructors / destructors
int WebPInitDecBufferInternal(WebPDecBuffer* buffer, int version) {
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
return 0; // version mismatch
}
if (buffer == NULL) return 0;
memset(buffer, 0, sizeof(*buffer));
return 1;
}
void WebPFreeDecBuffer(WebPDecBuffer* buffer) {
if (buffer != NULL) {
if (buffer->is_external_memory <= 0) {
WebPSafeFree(buffer->private_memory);
}
buffer->private_memory = NULL;
}
}
void WebPCopyDecBuffer(const WebPDecBuffer* const src,
WebPDecBuffer* const dst) {
if (src != NULL && dst != NULL) {
*dst = *src;
if (src->private_memory != NULL) {
dst->is_external_memory = 1; // dst buffer doesn't own the memory.
dst->private_memory = NULL;
}
}
}
// Copy and transfer ownership from src to dst (beware of parameter order!)
void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
if (src != NULL && dst != NULL) {
*dst = *src;
if (src->private_memory != NULL) {
src->is_external_memory = 1; // src relinquishes ownership
src->private_memory = NULL;
}
}
}
VP8StatusCode WebPCopyDecBufferPixels(const WebPDecBuffer* const src_buf,
WebPDecBuffer* const dst_buf) {
assert(src_buf != NULL && dst_buf != NULL);
assert(src_buf->colorspace == dst_buf->colorspace);
dst_buf->width = src_buf->width;
dst_buf->height = src_buf->height;
if (CheckDecBuffer(dst_buf) != VP8_STATUS_OK) {
return VP8_STATUS_INVALID_PARAM;
}
if (WebPIsRGBMode(src_buf->colorspace)) {
const WebPRGBABuffer* const src = &src_buf->u.RGBA;
const WebPRGBABuffer* const dst = &dst_buf->u.RGBA;
WebPCopyPlane(src->rgba, src->stride, dst->rgba, dst->stride,
src_buf->width * kModeBpp[src_buf->colorspace],
src_buf->height);
} else {
const WebPYUVABuffer* const src = &src_buf->u.YUVA;
const WebPYUVABuffer* const dst = &dst_buf->u.YUVA;
WebPCopyPlane(src->y, src->y_stride, dst->y, dst->y_stride,
src_buf->width, src_buf->height);
WebPCopyPlane(src->u, src->u_stride, dst->u, dst->u_stride,
(src_buf->width + 1) / 2, (src_buf->height + 1) / 2);
WebPCopyPlane(src->v, src->v_stride, dst->v, dst->v_stride,
(src_buf->width + 1) / 2, (src_buf->height + 1) / 2);
if (WebPIsAlphaMode(src_buf->colorspace)) {
WebPCopyPlane(src->a, src->a_stride, dst->a, dst->a_stride,
src_buf->width, src_buf->height);
}
}
return VP8_STATUS_OK;
}
int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
const WebPBitstreamFeatures* const features) {
assert(output != NULL);
return (output->is_external_memory >= 2) &&
WebPIsPremultipliedMode(output->colorspace) &&
(features != NULL && features->has_alpha);
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,54 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Definitions and macros common to encoding and decoding
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_COMMON_DEC_H_
#define WEBP_DEC_COMMON_DEC_H_
// intra prediction modes
enum { B_DC_PRED = 0, // 4x4 modes
B_TM_PRED = 1,
B_VE_PRED = 2,
B_HE_PRED = 3,
B_RD_PRED = 4,
B_VR_PRED = 5,
B_LD_PRED = 6,
B_VL_PRED = 7,
B_HD_PRED = 8,
B_HU_PRED = 9,
NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED, // = 10
// Luma16 or UV modes
DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
B_PRED = NUM_BMODES, // refined I4x4 mode
NUM_PRED_MODES = 4,
// special modes
B_DC_PRED_NOTOP = 4,
B_DC_PRED_NOLEFT = 5,
B_DC_PRED_NOTOPLEFT = 6,
NUM_B_DC_MODES = 7 };
enum { MB_FEATURE_TREE_PROBS = 3,
NUM_MB_SEGMENTS = 4,
NUM_REF_LF_DELTAS = 4,
NUM_MODE_LF_DELTAS = 4, // I4x4, ZERO, *, SPLIT
MAX_NUM_PARTITIONS = 8,
// Probabilities
NUM_TYPES = 4, // 0: i16-AC, 1: i16-DC, 2:chroma-AC, 3:i4-AC
NUM_BANDS = 8,
NUM_CTX = 3,
NUM_PROBAS = 11
};
#endif // WEBP_DEC_COMMON_DEC_H_

View File

@ -0,0 +1,803 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Frame-reconstruction function. Memory allocation.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "dec_vp8i_dec.h"
#include "utils_utils.h"
//------------------------------------------------------------------------------
// Main reconstruction function.
static const uint16_t kScan[16] = {
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
};
static int CheckMode(int mb_x, int mb_y, int mode) {
if (mode == B_DC_PRED) {
if (mb_x == 0) {
return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
} else {
return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
}
}
return mode;
}
static void Copy32b(uint8_t* const dst, const uint8_t* const src) {
memcpy(dst, src, 4);
}
static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
uint8_t* const dst) {
switch (bits >> 30) {
case 3:
VP8Transform(src, dst, 0);
break;
case 2:
VP8TransformAC3(src, dst);
break;
case 1:
VP8TransformDC(src, dst);
break;
default:
break;
}
}
static void DoUVTransform(uint32_t bits, const int16_t* const src,
uint8_t* const dst) {
if (bits & 0xff) { // any non-zero coeff at all?
if (bits & 0xaa) { // any non-zero AC coefficient?
VP8TransformUV(src, dst); // note we don't use the AC3 variant for U/V
} else {
VP8TransformDCUV(src, dst);
}
}
}
static void ReconstructRow(const VP8Decoder* const dec,
const VP8ThreadContext* ctx) {
int j;
int mb_x;
const int mb_y = ctx->mb_y_;
const int cache_id = ctx->id_;
uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
// Initialize left-most block.
for (j = 0; j < 16; ++j) {
y_dst[j * BPS - 1] = 129;
}
for (j = 0; j < 8; ++j) {
u_dst[j * BPS - 1] = 129;
v_dst[j * BPS - 1] = 129;
}
// Init top-left sample on left column too.
if (mb_y > 0) {
y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
} else {
// we only need to do this init once at block (0,0).
// Afterward, it remains valid for the whole topmost row.
memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
memset(u_dst - BPS - 1, 127, 8 + 1);
memset(v_dst - BPS - 1, 127, 8 + 1);
}
// Reconstruct one row.
for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
const VP8MBData* const block = ctx->mb_data_ + mb_x;
// Rotate in the left samples from previously decoded block. We move four
// pixels at a time for alignment reason, and because of in-loop filter.
if (mb_x > 0) {
for (j = -1; j < 16; ++j) {
Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
}
for (j = -1; j < 8; ++j) {
Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
}
}
{
// bring top samples into the cache
VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
const int16_t* const coeffs = block->coeffs_;
uint32_t bits = block->non_zero_y_;
int n;
if (mb_y > 0) {
memcpy(y_dst - BPS, top_yuv[0].y, 16);
memcpy(u_dst - BPS, top_yuv[0].u, 8);
memcpy(v_dst - BPS, top_yuv[0].v, 8);
}
// predict and add residuals
if (block->is_i4x4_) { // 4x4
uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
if (mb_y > 0) {
if (mb_x >= dec->mb_w_ - 1) { // on rightmost border
memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
} else {
memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
}
}
// replicate the top-right pixels below
top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
// predict and add residuals for all 4x4 blocks in turn.
for (n = 0; n < 16; ++n, bits <<= 2) {
uint8_t* const dst = y_dst + kScan[n];
VP8PredLuma4[block->imodes_[n]](dst);
DoTransform(bits, coeffs + n * 16, dst);
}
} else { // 16x16
const int pred_func = CheckMode(mb_x, mb_y, block->imodes_[0]);
VP8PredLuma16[pred_func](y_dst);
if (bits != 0) {
for (n = 0; n < 16; ++n, bits <<= 2) {
DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
}
}
}
{
// Chroma
const uint32_t bits_uv = block->non_zero_uv_;
const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
VP8PredChroma8[pred_func](u_dst);
VP8PredChroma8[pred_func](v_dst);
DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
}
// stash away top samples for next block
if (mb_y < dec->mb_h_ - 1) {
memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
memcpy(top_yuv[0].u, u_dst + 7 * BPS, 8);
memcpy(top_yuv[0].v, v_dst + 7 * BPS, 8);
}
}
// Transfer reconstructed samples from yuv_b_ cache to final destination.
{
const int y_offset = cache_id * 16 * dec->cache_y_stride_;
const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
for (j = 0; j < 16; ++j) {
memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
}
for (j = 0; j < 8; ++j) {
memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
}
}
}
}
//------------------------------------------------------------------------------
// Filtering
// kFilterExtraRows[] = How many extra lines are needed on the MB boundary
// for caching, given a filtering level.
// Simple filter: up to 2 luma samples are read and 1 is written.
// Complex filter: up to 4 luma samples are read and 3 are written. Same for
// U/V, so it's 8 samples total (because of the 2x upsampling).
static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
const int cache_id = ctx->id_;
const int y_bps = dec->cache_y_stride_;
const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
const int ilevel = f_info->f_ilevel_;
const int limit = f_info->f_limit_;
if (limit == 0) {
return;
}
assert(limit >= 3);
if (dec->filter_type_ == 1) { // simple
if (mb_x > 0) {
VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
}
if (f_info->f_inner_) {
VP8SimpleHFilter16i(y_dst, y_bps, limit);
}
if (mb_y > 0) {
VP8SimpleVFilter16(y_dst, y_bps, limit + 4);
}
if (f_info->f_inner_) {
VP8SimpleVFilter16i(y_dst, y_bps, limit);
}
} else { // complex
const int uv_bps = dec->cache_uv_stride_;
uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
const int hev_thresh = f_info->hev_thresh_;
if (mb_x > 0) {
VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
}
if (f_info->f_inner_) {
VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
}
if (mb_y > 0) {
VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
}
if (f_info->f_inner_) {
VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh);
VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh);
}
}
}
// Filter the decoded macroblock row (if needed)
static void FilterRow(const VP8Decoder* const dec) {
int mb_x;
const int mb_y = dec->thread_ctx_.mb_y_;
assert(dec->thread_ctx_.filter_row_);
for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
DoFilter(dec, mb_x, mb_y);
}
}
//------------------------------------------------------------------------------
// Precompute the filtering strength for each segment and each i4x4/i16x16 mode.
static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
if (dec->filter_type_ > 0) {
int s;
const VP8FilterHeader* const hdr = &dec->filter_hdr_;
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
int i4x4;
// First, compute the initial level
int base_level;
if (dec->segment_hdr_.use_segment_) {
base_level = dec->segment_hdr_.filter_strength_[s];
if (!dec->segment_hdr_.absolute_delta_) {
base_level += hdr->level_;
}
} else {
base_level = hdr->level_;
}
for (i4x4 = 0; i4x4 <= 1; ++i4x4) {
VP8FInfo* const info = &dec->fstrengths_[s][i4x4];
int level = base_level;
if (hdr->use_lf_delta_) {
level += hdr->ref_lf_delta_[0];
if (i4x4) {
level += hdr->mode_lf_delta_[0];
}
}
level = (level < 0) ? 0 : (level > 63) ? 63 : level;
if (level > 0) {
int ilevel = level;
if (hdr->sharpness_ > 0) {
if (hdr->sharpness_ > 4) {
ilevel >>= 2;
} else {
ilevel >>= 1;
}
if (ilevel > 9 - hdr->sharpness_) {
ilevel = 9 - hdr->sharpness_;
}
}
if (ilevel < 1) ilevel = 1;
info->f_ilevel_ = ilevel;
info->f_limit_ = 2 * level + ilevel;
info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
} else {
info->f_limit_ = 0; // no filtering
}
info->f_inner_ = i4x4;
}
}
}
}
//------------------------------------------------------------------------------
// Dithering
// minimal amp that will provide a non-zero dithering effect
#define MIN_DITHER_AMP 4
#define DITHER_AMP_TAB_SIZE 12
static const uint8_t kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
// roughly, it's dqm->uv_mat_[1]
8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
};
void VP8InitDithering(const WebPDecoderOptions* const options,
VP8Decoder* const dec) {
assert(dec != NULL);
if (options != NULL) {
const int d = options->dithering_strength;
const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
if (f > 0) {
int s;
int all_amp = 0;
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
VP8QuantMatrix* const dqm = &dec->dqm_[s];
if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
}
all_amp |= dqm->dither_;
}
if (all_amp != 0) {
VP8InitRandom(&dec->dithering_rg_, 1.0f);
dec->dither_ = 1;
}
}
// potentially allow alpha dithering
dec->alpha_dithering_ = options->alpha_dithering_strength;
if (dec->alpha_dithering_ > 100) {
dec->alpha_dithering_ = 100;
} else if (dec->alpha_dithering_ < 0) {
dec->alpha_dithering_ = 0;
}
}
}
// Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
uint8_t dither[64];
int i;
for (i = 0; i < 8 * 8; ++i) {
dither[i] = VP8RandomBits2(rg, VP8_DITHER_AMP_BITS + 1, amp);
}
VP8DitherCombine8x8(dither, dst, bps);
}
static void DitherRow(VP8Decoder* const dec) {
int mb_x;
assert(dec->dither_);
for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
const VP8MBData* const data = ctx->mb_data_ + mb_x;
const int cache_id = ctx->id_;
const int uv_bps = dec->cache_uv_stride_;
if (data->dither_ >= MIN_DITHER_AMP) {
uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);
Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);
}
}
}
//------------------------------------------------------------------------------
// This function is called after a row of macroblocks is finished decoding.
// It also takes into account the following restrictions:
// * In case of in-loop filtering, we must hold off sending some of the bottom
// pixels as they are yet unfiltered. They will be when the next macroblock
// row is decoded. Meanwhile, we must preserve them by rotating them in the
// cache area. This doesn't hold for the very bottom row of the uncropped
// picture of course.
// * we must clip the remaining pixels against the cropping area. The VP8Io
// struct must have the following fields set correctly before calling put():
#define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB
// Finalize and transmit a complete row. Return false in case of user-abort.
static int FinishRow(void* arg1, void* arg2) {
VP8Decoder* const dec = (VP8Decoder*)arg1;
VP8Io* const io = (VP8Io*)arg2;
int ok = 1;
const VP8ThreadContext* const ctx = &dec->thread_ctx_;
const int cache_id = ctx->id_;
const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
const int ysize = extra_y_rows * dec->cache_y_stride_;
const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
const int y_offset = cache_id * 16 * dec->cache_y_stride_;
const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
const int mb_y = ctx->mb_y_;
const int is_first_row = (mb_y == 0);
const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
if (dec->mt_method_ == 2) {
ReconstructRow(dec, ctx);
}
if (ctx->filter_row_) {
FilterRow(dec);
}
if (dec->dither_) {
DitherRow(dec);
}
if (io->put != NULL) {
int y_start = MACROBLOCK_VPOS(mb_y);
int y_end = MACROBLOCK_VPOS(mb_y + 1);
if (!is_first_row) {
y_start -= extra_y_rows;
io->y = ydst;
io->u = udst;
io->v = vdst;
} else {
io->y = dec->cache_y_ + y_offset;
io->u = dec->cache_u_ + uv_offset;
io->v = dec->cache_v_ + uv_offset;
}
if (!is_last_row) {
y_end -= extra_y_rows;
}
if (y_end > io->crop_bottom) {
y_end = io->crop_bottom; // make sure we don't overflow on last row.
}
// If dec->alpha_data_ is not NULL, we have some alpha plane present.
io->a = NULL;
if (dec->alpha_data_ != NULL && y_start < y_end) {
io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start);
if (io->a == NULL) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"Could not decode alpha data.");
}
}
if (y_start < io->crop_top) {
const int delta_y = io->crop_top - y_start;
y_start = io->crop_top;
assert(!(delta_y & 1));
io->y += dec->cache_y_stride_ * delta_y;
io->u += dec->cache_uv_stride_ * (delta_y >> 1);
io->v += dec->cache_uv_stride_ * (delta_y >> 1);
if (io->a != NULL) {
io->a += io->width * delta_y;
}
}
if (y_start < y_end) {
io->y += io->crop_left;
io->u += io->crop_left >> 1;
io->v += io->crop_left >> 1;
if (io->a != NULL) {
io->a += io->crop_left;
}
io->mb_y = y_start - io->crop_top;
io->mb_w = io->crop_right - io->crop_left;
io->mb_h = y_end - y_start;
ok = io->put(io);
}
}
// rotate top samples if needed
if (cache_id + 1 == dec->num_caches_) {
if (!is_last_row) {
memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
}
}
return ok;
}
#undef MACROBLOCK_VPOS
//------------------------------------------------------------------------------
int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
int ok = 1;
VP8ThreadContext* const ctx = &dec->thread_ctx_;
const int filter_row =
(dec->filter_type_ > 0) &&
(dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
if (dec->mt_method_ == 0) {
// ctx->id_ and ctx->f_info_ are already set
ctx->mb_y_ = dec->mb_y_;
ctx->filter_row_ = filter_row;
ReconstructRow(dec, ctx);
ok = FinishRow(dec, io);
} else {
WebPWorker* const worker = &dec->worker_;
// Finish previous job *before* updating context
ok &= WebPGetWorkerInterface()->Sync(worker);
assert(worker->status_ == OK);
if (ok) { // spawn a new deblocking/output job
ctx->io_ = *io;
ctx->id_ = dec->cache_id_;
ctx->mb_y_ = dec->mb_y_;
ctx->filter_row_ = filter_row;
if (dec->mt_method_ == 2) { // swap macroblock data
VP8MBData* const tmp = ctx->mb_data_;
ctx->mb_data_ = dec->mb_data_;
dec->mb_data_ = tmp;
} else {
// perform reconstruction directly in main thread
ReconstructRow(dec, ctx);
}
if (filter_row) { // swap filter info
VP8FInfo* const tmp = ctx->f_info_;
ctx->f_info_ = dec->f_info_;
dec->f_info_ = tmp;
}
// (reconstruct)+filter in parallel
WebPGetWorkerInterface()->Launch(worker);
if (++dec->cache_id_ == dec->num_caches_) {
dec->cache_id_ = 0;
}
}
}
return ok;
}
//------------------------------------------------------------------------------
// Finish setting up the decoding parameter once user's setup() is called.
VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
// Call setup() first. This may trigger additional decoding features on 'io'.
// Note: Afterward, we must call teardown() no matter what.
if (io->setup != NULL && !io->setup(io)) {
VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
return dec->status_;
}
// Disable filtering per user request
if (io->bypass_filtering) {
dec->filter_type_ = 0;
}
// Define the area where we can skip in-loop filtering, in case of cropping.
//
// 'Simple' filter reads two luma samples outside of the macroblock
// and filters one. It doesn't filter the chroma samples. Hence, we can
// avoid doing the in-loop filtering before crop_top/crop_left position.
// For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
// Means: there's a dependency chain that goes all the way up to the
// top-left corner of the picture (MB #0). We must filter all the previous
// macroblocks.
{
const int extra_pixels = kFilterExtraRows[dec->filter_type_];
if (dec->filter_type_ == 2) {
// For complex filter, we need to preserve the dependency chain.
dec->tl_mb_x_ = 0;
dec->tl_mb_y_ = 0;
} else {
// For simple filter, we can filter only the cropped region.
// We include 'extra_pixels' on the other side of the boundary, since
// vertical or horizontal filtering of the previous macroblock can
// modify some abutting pixels.
dec->tl_mb_x_ = (io->crop_left - extra_pixels) >> 4;
dec->tl_mb_y_ = (io->crop_top - extra_pixels) >> 4;
if (dec->tl_mb_x_ < 0) dec->tl_mb_x_ = 0;
if (dec->tl_mb_y_ < 0) dec->tl_mb_y_ = 0;
}
// We need some 'extra' pixels on the right/bottom.
dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4;
dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4;
if (dec->br_mb_x_ > dec->mb_w_) {
dec->br_mb_x_ = dec->mb_w_;
}
if (dec->br_mb_y_ > dec->mb_h_) {
dec->br_mb_y_ = dec->mb_h_;
}
}
PrecomputeFilterStrengths(dec);
return VP8_STATUS_OK;
}
int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
int ok = 1;
if (dec->mt_method_ > 0) {
ok = WebPGetWorkerInterface()->Sync(&dec->worker_);
}
if (io->teardown != NULL) {
io->teardown(io);
}
return ok;
}
//------------------------------------------------------------------------------
// For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line.
//
// Reason is: the deblocking filter cannot deblock the bottom horizontal edges
// immediately, and needs to wait for first few rows of the next macroblock to
// be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending
// on strength).
// With two threads, the vertical positions of the rows being decoded are:
// Decode: [ 0..15][16..31][32..47][48..63][64..79][...
// Deblock: [ 0..11][12..27][28..43][44..59][...
// If we use two threads and two caches of 16 pixels, the sequence would be:
// Decode: [ 0..15][16..31][ 0..15!!][16..31][ 0..15][...
// Deblock: [ 0..11][12..27!!][-4..11][12..27][...
// The problem occurs during row [12..15!!] that both the decoding and
// deblocking threads are writing simultaneously.
// With 3 cache lines, one get a safe write pattern:
// Decode: [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0..
// Deblock: [ 0..11][12..27][28..43][-4..11][12..27][28...
// Note that multi-threaded output _without_ deblocking can make use of two
// cache lines of 16 pixels only, since there's no lagging behind. The decoding
// and output process have non-concurrent writing:
// Decode: [ 0..15][16..31][ 0..15][16..31][...
// io->put: [ 0..15][16..31][ 0..15][...
#define MT_CACHE_LINES 3
#define ST_CACHE_LINES 1 // 1 cache row only for single-threaded case
// Initialize multi/single-thread worker
static int InitThreadContext(VP8Decoder* const dec) {
dec->cache_id_ = 0;
if (dec->mt_method_ > 0) {
WebPWorker* const worker = &dec->worker_;
if (!WebPGetWorkerInterface()->Reset(worker)) {
return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
"thread initialization failed.");
}
worker->data1 = dec;
worker->data2 = (void*)&dec->thread_ctx_.io_;
worker->hook = FinishRow;
dec->num_caches_ =
(dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
} else {
dec->num_caches_ = ST_CACHE_LINES;
}
return 1;
}
int VP8GetThreadMethod(const WebPDecoderOptions* const options,
const WebPHeaderStructure* const headers,
int width, int height) {
if (options == NULL || options->use_threads == 0) {
return 0;
}
(void)headers;
(void)width;
(void)height;
assert(headers == NULL || !headers->is_lossless);
#if defined(WEBP_USE_THREAD)
if (width >= MIN_WIDTH_FOR_THREADS) return 2;
#endif
return 0;
}
#undef MT_CACHE_LINES
#undef ST_CACHE_LINES
//------------------------------------------------------------------------------
// Memory setup
static int AllocateMemory(VP8Decoder* const dec) {
const int num_caches = dec->num_caches_;
const int mb_w = dec->mb_w_;
// Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
const size_t top_size = sizeof(VP8TopSamples) * mb_w;
const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
const size_t f_info_size =
(dec->filter_type_ > 0) ?
mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo)
: 0;
const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
const size_t mb_data_size =
(dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
const size_t cache_height = (16 * num_caches
+ kFilterExtraRows[dec->filter_type_]) * 3 / 2;
const size_t cache_size = top_size * cache_height;
// alpha_size is the only one that scales as width x height.
const uint64_t alpha_size = (dec->alpha_data_ != NULL) ?
(uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
const uint64_t needed = (uint64_t)intra_pred_mode_size
+ top_size + mb_info_size + f_info_size
+ yuv_size + mb_data_size
+ cache_size + alpha_size + WEBP_ALIGN_CST;
uint8_t* mem;
if (needed != (size_t)needed) return 0; // check for overflow
if (needed > dec->mem_size_) {
WebPSafeFree(dec->mem_);
dec->mem_size_ = 0;
dec->mem_ = WebPSafeMalloc(needed, sizeof(uint8_t));
if (dec->mem_ == NULL) {
return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
"no memory during frame initialization.");
}
// down-cast is ok, thanks to WebPSafeMalloc() above.
dec->mem_size_ = (size_t)needed;
}
mem = (uint8_t*)dec->mem_;
dec->intra_t_ = mem;
mem += intra_pred_mode_size;
dec->yuv_t_ = (VP8TopSamples*)mem;
mem += top_size;
dec->mb_info_ = ((VP8MB*)mem) + 1;
mem += mb_info_size;
dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL;
mem += f_info_size;
dec->thread_ctx_.id_ = 0;
dec->thread_ctx_.f_info_ = dec->f_info_;
if (dec->filter_type_ > 0 && dec->mt_method_ > 0) {
// secondary cache line. The deblocking process need to make use of the
// filtering strength from previous macroblock row, while the new ones
// are being decoded in parallel. We'll just swap the pointers.
dec->thread_ctx_.f_info_ += mb_w;
}
mem = (uint8_t*)WEBP_ALIGN(mem);
assert((yuv_size & WEBP_ALIGN_CST) == 0);
dec->yuv_b_ = mem;
mem += yuv_size;
dec->mb_data_ = (VP8MBData*)mem;
dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
if (dec->mt_method_ == 2) {
dec->thread_ctx_.mb_data_ += mb_w;
}
mem += mb_data_size;
dec->cache_y_stride_ = 16 * mb_w;
dec->cache_uv_stride_ = 8 * mb_w;
{
const int extra_rows = kFilterExtraRows[dec->filter_type_];
const int extra_y = extra_rows * dec->cache_y_stride_;
const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_;
dec->cache_y_ = mem + extra_y;
dec->cache_u_ = dec->cache_y_
+ 16 * num_caches * dec->cache_y_stride_ + extra_uv;
dec->cache_v_ = dec->cache_u_
+ 8 * num_caches * dec->cache_uv_stride_ + extra_uv;
dec->cache_id_ = 0;
}
mem += cache_size;
// alpha plane
dec->alpha_plane_ = alpha_size ? mem : NULL;
mem += alpha_size;
assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
// note: left/top-info is initialized once for all.
memset(dec->mb_info_ - 1, 0, mb_info_size);
VP8InitScanline(dec); // initialize left too.
// initialize top
memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
return 1;
}
static void InitIo(VP8Decoder* const dec, VP8Io* io) {
// prepare 'io'
io->mb_y = 0;
io->y = dec->cache_y_;
io->u = dec->cache_u_;
io->v = dec->cache_v_;
io->y_stride = dec->cache_y_stride_;
io->uv_stride = dec->cache_uv_stride_;
io->a = NULL;
}
int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io) {
if (!InitThreadContext(dec)) return 0; // call first. Sets dec->num_caches_.
if (!AllocateMemory(dec)) return 0;
InitIo(dec, io);
VP8DspInit(); // Init critical function pointers and look-up tables.
return 1;
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,908 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Incremental decoding
//
// Author: somnath@google.com (Somnath Banerjee)
#include <assert.h>
#include <string.h>
#include <stdlib.h>
#include "dec_alphai_dec.h"
#include "dec_webpi_dec.h"
#include "dec_vp8i_dec.h"
#include "utils_utils.h"
// In append mode, buffer allocations increase as multiples of this value.
// Needs to be a power of 2.
#define CHUNK_SIZE 4096
#define MAX_MB_SIZE 4096
//------------------------------------------------------------------------------
// Data structures for memory and states
// Decoding states. State normally flows as:
// WEBP_HEADER->VP8_HEADER->VP8_PARTS0->VP8_DATA->DONE for a lossy image, and
// WEBP_HEADER->VP8L_HEADER->VP8L_DATA->DONE for a lossless image.
// If there is any error the decoder goes into state ERROR.
typedef enum {
STATE_WEBP_HEADER, // All the data before that of the VP8/VP8L chunk.
STATE_VP8_HEADER, // The VP8 Frame header (within the VP8 chunk).
STATE_VP8_PARTS0,
STATE_VP8_DATA,
STATE_VP8L_HEADER,
STATE_VP8L_DATA,
STATE_DONE,
STATE_ERROR
} DecState;
// Operating state for the MemBuffer
typedef enum {
MEM_MODE_NONE = 0,
MEM_MODE_APPEND,
MEM_MODE_MAP
} MemBufferMode;
// storage for partition #0 and partial data (in a rolling fashion)
typedef struct {
MemBufferMode mode_; // Operation mode
size_t start_; // start location of the data to be decoded
size_t end_; // end location
size_t buf_size_; // size of the allocated buffer
uint8_t* buf_; // We don't own this buffer in case WebPIUpdate()
size_t part0_size_; // size of partition #0
const uint8_t* part0_buf_; // buffer to store partition #0
} MemBuffer;
struct WebPIDecoder {
DecState state_; // current decoding state
WebPDecParams params_; // Params to store output info
int is_lossless_; // for down-casting 'dec_'.
void* dec_; // either a VP8Decoder or a VP8LDecoder instance
VP8Io io_;
MemBuffer mem_; // input memory buffer.
WebPDecBuffer output_; // output buffer (when no external one is supplied,
// or if the external one has slow-memory)
WebPDecBuffer* final_output_; // Slow-memory output to copy to eventually.
size_t chunk_size_; // Compressed VP8/VP8L size extracted from Header.
int last_mb_y_; // last row reached for intra-mode decoding
};
// MB context to restore in case VP8DecodeMB() fails
typedef struct {
VP8MB left_;
VP8MB info_;
VP8BitReader token_br_;
} MBContext;
//------------------------------------------------------------------------------
// MemBuffer: incoming data handling
static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
return (mem->end_ - mem->start_);
}
// Check if we need to preserve the compressed alpha data, as it may not have
// been decoded yet.
static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
if (idec->state_ == STATE_WEBP_HEADER) {
// We haven't parsed the headers yet, so we don't know whether the image is
// lossy or lossless. This also means that we haven't parsed the ALPH chunk.
return 0;
}
if (idec->is_lossless_) {
return 0; // ALPH chunk is not present for lossless images.
} else {
const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
assert(dec != NULL); // Must be true as idec->state_ != STATE_WEBP_HEADER.
return (dec->alpha_data_ != NULL) && !dec->is_alpha_decoded_;
}
}
static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
MemBuffer* const mem = &idec->mem_;
const uint8_t* const new_base = mem->buf_ + mem->start_;
// note: for VP8, setting up idec->io_ is only really needed at the beginning
// of the decoding, till partition #0 is complete.
idec->io_.data = new_base;
idec->io_.data_size = MemDataSize(mem);
if (idec->dec_ != NULL) {
if (!idec->is_lossless_) {
VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
const uint32_t last_part = dec->num_parts_minus_one_;
if (offset != 0) {
uint32_t p;
for (p = 0; p <= last_part; ++p) {
VP8RemapBitReader(dec->parts_ + p, offset);
}
// Remap partition #0 data pointer to new offset, but only in MAP
// mode (in APPEND mode, partition #0 is copied into a fixed memory).
if (mem->mode_ == MEM_MODE_MAP) {
VP8RemapBitReader(&dec->br_, offset);
}
}
{
const uint8_t* const last_start = dec->parts_[last_part].buf_;
VP8BitReaderSetBuffer(&dec->parts_[last_part], last_start,
mem->buf_ + mem->end_ - last_start);
}
if (NeedCompressedAlpha(idec)) {
ALPHDecoder* const alph_dec = dec->alph_dec_;
dec->alpha_data_ += offset;
if (alph_dec != NULL && alph_dec->vp8l_dec_ != NULL) {
if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) {
VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_;
assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN);
VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_,
dec->alpha_data_ + ALPHA_HEADER_LEN,
dec->alpha_data_size_ - ALPHA_HEADER_LEN);
} else { // alph_dec->method_ == ALPHA_NO_COMPRESSION
// Nothing special to do in this case.
}
}
}
} else { // Resize lossless bitreader
VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
}
}
}
// Appends data to the end of MemBuffer->buf_. It expands the allocated memory
// size if required and also updates VP8BitReader's if new memory is allocated.
static int AppendToMemBuffer(WebPIDecoder* const idec,
const uint8_t* const data, size_t data_size) {
VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
MemBuffer* const mem = &idec->mem_;
const int need_compressed_alpha = NeedCompressedAlpha(idec);
const uint8_t* const old_start =
(mem->buf_ == NULL) ? NULL : mem->buf_ + mem->start_;
const uint8_t* const old_base =
need_compressed_alpha ? dec->alpha_data_ : old_start;
assert(mem->buf_ != NULL || mem->start_ == 0);
assert(mem->mode_ == MEM_MODE_APPEND);
if (data_size > MAX_CHUNK_PAYLOAD) {
// security safeguard: trying to allocate more than what the format
// allows for a chunk should be considered a smoke smell.
return 0;
}
if (mem->end_ + data_size > mem->buf_size_) { // Need some free memory
const size_t new_mem_start = old_start - old_base;
const size_t current_size = MemDataSize(mem) + new_mem_start;
const uint64_t new_size = (uint64_t)current_size + data_size;
const uint64_t extra_size = (new_size + CHUNK_SIZE - 1) & ~(CHUNK_SIZE - 1);
uint8_t* const new_buf =
(uint8_t*)WebPSafeMalloc(extra_size, sizeof(*new_buf));
if (new_buf == NULL) return 0;
if (old_base != NULL) memcpy(new_buf, old_base, current_size);
WebPSafeFree(mem->buf_);
mem->buf_ = new_buf;
mem->buf_size_ = (size_t)extra_size;
mem->start_ = new_mem_start;
mem->end_ = current_size;
}
assert(mem->buf_ != NULL);
memcpy(mem->buf_ + mem->end_, data, data_size);
mem->end_ += data_size;
assert(mem->end_ <= mem->buf_size_);
DoRemap(idec, mem->buf_ + mem->start_ - old_start);
return 1;
}
static int RemapMemBuffer(WebPIDecoder* const idec,
const uint8_t* const data, size_t data_size) {
MemBuffer* const mem = &idec->mem_;
const uint8_t* const old_buf = mem->buf_;
const uint8_t* const old_start =
(old_buf == NULL) ? NULL : old_buf + mem->start_;
assert(old_buf != NULL || mem->start_ == 0);
assert(mem->mode_ == MEM_MODE_MAP);
if (data_size < mem->buf_size_) return 0; // can't remap to a shorter buffer!
mem->buf_ = (uint8_t*)data;
mem->end_ = mem->buf_size_ = data_size;
DoRemap(idec, mem->buf_ + mem->start_ - old_start);
return 1;
}
static void InitMemBuffer(MemBuffer* const mem) {
mem->mode_ = MEM_MODE_NONE;
mem->buf_ = NULL;
mem->buf_size_ = 0;
mem->part0_buf_ = NULL;
mem->part0_size_ = 0;
}
static void ClearMemBuffer(MemBuffer* const mem) {
assert(mem);
if (mem->mode_ == MEM_MODE_APPEND) {
WebPSafeFree(mem->buf_);
WebPSafeFree((void*)mem->part0_buf_);
}
}
static int CheckMemBufferMode(MemBuffer* const mem, MemBufferMode expected) {
if (mem->mode_ == MEM_MODE_NONE) {
mem->mode_ = expected; // switch to the expected mode
} else if (mem->mode_ != expected) {
return 0; // we mixed the modes => error
}
assert(mem->mode_ == expected); // mode is ok
return 1;
}
// To be called last.
static VP8StatusCode FinishDecoding(WebPIDecoder* const idec) {
const WebPDecoderOptions* const options = idec->params_.options;
WebPDecBuffer* const output = idec->params_.output;
idec->state_ = STATE_DONE;
if (options != NULL && options->flip) {
const VP8StatusCode status = WebPFlipBuffer(output);
if (status != VP8_STATUS_OK) return status;
}
if (idec->final_output_ != NULL) {
WebPCopyDecBufferPixels(output, idec->final_output_); // do the slow-copy
WebPFreeDecBuffer(&idec->output_);
*output = *idec->final_output_;
idec->final_output_ = NULL;
}
return VP8_STATUS_OK;
}
//------------------------------------------------------------------------------
// Macroblock-decoding contexts
static void SaveContext(const VP8Decoder* dec, const VP8BitReader* token_br,
MBContext* const context) {
context->left_ = dec->mb_info_[-1];
context->info_ = dec->mb_info_[dec->mb_x_];
context->token_br_ = *token_br;
}
static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
VP8BitReader* const token_br) {
dec->mb_info_[-1] = context->left_;
dec->mb_info_[dec->mb_x_] = context->info_;
*token_br = context->token_br_;
}
//------------------------------------------------------------------------------
static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
if (idec->state_ == STATE_VP8_DATA) {
// Synchronize the thread, clean-up and check for errors.
VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
}
idec->state_ = STATE_ERROR;
return error;
}
static void ChangeState(WebPIDecoder* const idec, DecState new_state,
size_t consumed_bytes) {
MemBuffer* const mem = &idec->mem_;
idec->state_ = new_state;
mem->start_ += consumed_bytes;
assert(mem->start_ <= mem->end_);
idec->io_.data = mem->buf_ + mem->start_;
idec->io_.data_size = MemDataSize(mem);
}
// Headers
static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
MemBuffer* const mem = &idec->mem_;
const uint8_t* data = mem->buf_ + mem->start_;
size_t curr_size = MemDataSize(mem);
VP8StatusCode status;
WebPHeaderStructure headers;
headers.data = data;
headers.data_size = curr_size;
headers.have_all_data = 0;
status = WebPParseHeaders(&headers);
if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
return VP8_STATUS_SUSPENDED; // We haven't found a VP8 chunk yet.
} else if (status != VP8_STATUS_OK) {
return IDecError(idec, status);
}
idec->chunk_size_ = headers.compressed_size;
idec->is_lossless_ = headers.is_lossless;
if (!idec->is_lossless_) {
VP8Decoder* const dec = VP8New();
if (dec == NULL) {
return VP8_STATUS_OUT_OF_MEMORY;
}
idec->dec_ = dec;
dec->alpha_data_ = headers.alpha_data;
dec->alpha_data_size_ = headers.alpha_data_size;
ChangeState(idec, STATE_VP8_HEADER, headers.offset);
} else {
VP8LDecoder* const dec = VP8LNew();
if (dec == NULL) {
return VP8_STATUS_OUT_OF_MEMORY;
}
idec->dec_ = dec;
ChangeState(idec, STATE_VP8L_HEADER, headers.offset);
}
return VP8_STATUS_OK;
}
static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
const size_t curr_size = MemDataSize(&idec->mem_);
int width, height;
uint32_t bits;
if (curr_size < VP8_FRAME_HEADER_SIZE) {
// Not enough data bytes to extract VP8 Frame Header.
return VP8_STATUS_SUSPENDED;
}
if (!VP8GetInfo(data, curr_size, idec->chunk_size_, &width, &height)) {
return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
}
bits = data[0] | (data[1] << 8) | (data[2] << 16);
idec->mem_.part0_size_ = (bits >> 5) + VP8_FRAME_HEADER_SIZE;
idec->io_.data = data;
idec->io_.data_size = curr_size;
idec->state_ = STATE_VP8_PARTS0;
return VP8_STATUS_OK;
}
// Partition #0
static VP8StatusCode CopyParts0Data(WebPIDecoder* const idec) {
VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
VP8BitReader* const br = &dec->br_;
const size_t part_size = br->buf_end_ - br->buf_;
MemBuffer* const mem = &idec->mem_;
assert(!idec->is_lossless_);
assert(mem->part0_buf_ == NULL);
// the following is a format limitation, no need for runtime check:
assert(part_size <= mem->part0_size_);
if (part_size == 0) { // can't have zero-size partition #0
return VP8_STATUS_BITSTREAM_ERROR;
}
if (mem->mode_ == MEM_MODE_APPEND) {
// We copy and grab ownership of the partition #0 data.
uint8_t* const part0_buf = (uint8_t*)WebPSafeMalloc(1ULL, part_size);
if (part0_buf == NULL) {
return VP8_STATUS_OUT_OF_MEMORY;
}
memcpy(part0_buf, br->buf_, part_size);
mem->part0_buf_ = part0_buf;
VP8BitReaderSetBuffer(br, part0_buf, part_size);
} else {
// Else: just keep pointers to the partition #0's data in dec_->br_.
}
mem->start_ += part_size;
return VP8_STATUS_OK;
}
static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
VP8Io* const io = &idec->io_;
const WebPDecParams* const params = &idec->params_;
WebPDecBuffer* const output = params->output;
// Wait till we have enough data for the whole partition #0
if (MemDataSize(&idec->mem_) < idec->mem_.part0_size_) {
return VP8_STATUS_SUSPENDED;
}
if (!VP8GetHeaders(dec, io)) {
const VP8StatusCode status = dec->status_;
if (status == VP8_STATUS_SUSPENDED ||
status == VP8_STATUS_NOT_ENOUGH_DATA) {
// treating NOT_ENOUGH_DATA as SUSPENDED state
return VP8_STATUS_SUSPENDED;
}
return IDecError(idec, status);
}
// Allocate/Verify output buffer now
dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
output);
if (dec->status_ != VP8_STATUS_OK) {
return IDecError(idec, dec->status_);
}
// This change must be done before calling VP8InitFrame()
dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
io->width, io->height);
VP8InitDithering(params->options, dec);
dec->status_ = CopyParts0Data(idec);
if (dec->status_ != VP8_STATUS_OK) {
return IDecError(idec, dec->status_);
}
// Finish setting up the decoding parameters. Will call io->setup().
if (VP8EnterCritical(dec, io) != VP8_STATUS_OK) {
return IDecError(idec, dec->status_);
}
// Note: past this point, teardown() must always be called
// in case of error.
idec->state_ = STATE_VP8_DATA;
// Allocate memory and prepare everything.
if (!VP8InitFrame(dec, io)) {
return IDecError(idec, dec->status_);
}
return VP8_STATUS_OK;
}
// Remaining partitions
static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
VP8Io* const io = &idec->io_;
// Make sure partition #0 has been read before, to set dec to ready_.
if (!dec->ready_) {
return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
}
for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
if (idec->last_mb_y_ != dec->mb_y_) {
if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
// note: normally, error shouldn't occur since we already have the whole
// partition0 available here in DecodeRemaining(). Reaching EOF while
// reading intra modes really means a BITSTREAM_ERROR.
return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
}
idec->last_mb_y_ = dec->mb_y_;
}
for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
VP8BitReader* const token_br =
&dec->parts_[dec->mb_y_ & dec->num_parts_minus_one_];
MBContext context;
SaveContext(dec, token_br, &context);
if (!VP8DecodeMB(dec, token_br)) {
// We shouldn't fail when MAX_MB data was available
if (dec->num_parts_minus_one_ == 0 &&
MemDataSize(&idec->mem_) > MAX_MB_SIZE) {
return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
}
// Synchronize the threads.
if (dec->mt_method_ > 0) {
if (!WebPGetWorkerInterface()->Sync(&dec->worker_)) {
return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
}
}
RestoreContext(&context, dec, token_br);
return VP8_STATUS_SUSPENDED;
}
// Release buffer only if there is only one partition
if (dec->num_parts_minus_one_ == 0) {
idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
assert(idec->mem_.start_ <= idec->mem_.end_);
}
}
VP8InitScanline(dec); // Prepare for next scanline
// Reconstruct, filter and emit the row.
if (!VP8ProcessRow(dec, io)) {
return IDecError(idec, VP8_STATUS_USER_ABORT);
}
}
// Synchronize the thread and check for errors.
if (!VP8ExitCritical(dec, io)) {
idec->state_ = STATE_ERROR; // prevent re-entry in IDecError
return IDecError(idec, VP8_STATUS_USER_ABORT);
}
dec->ready_ = 0;
return FinishDecoding(idec);
}
static VP8StatusCode ErrorStatusLossless(WebPIDecoder* const idec,
VP8StatusCode status) {
if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
return VP8_STATUS_SUSPENDED;
}
return IDecError(idec, status);
}
static VP8StatusCode DecodeVP8LHeader(WebPIDecoder* const idec) {
VP8Io* const io = &idec->io_;
VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
const WebPDecParams* const params = &idec->params_;
WebPDecBuffer* const output = params->output;
size_t curr_size = MemDataSize(&idec->mem_);
assert(idec->is_lossless_);
// Wait until there's enough data for decoding header.
if (curr_size < (idec->chunk_size_ >> 3)) {
dec->status_ = VP8_STATUS_SUSPENDED;
return ErrorStatusLossless(idec, dec->status_);
}
if (!VP8LDecodeHeader(dec, io)) {
if (dec->status_ == VP8_STATUS_BITSTREAM_ERROR &&
curr_size < idec->chunk_size_) {
dec->status_ = VP8_STATUS_SUSPENDED;
}
return ErrorStatusLossless(idec, dec->status_);
}
// Allocate/verify output buffer now.
dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options,
output);
if (dec->status_ != VP8_STATUS_OK) {
return IDecError(idec, dec->status_);
}
idec->state_ = STATE_VP8L_DATA;
return VP8_STATUS_OK;
}
static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
const size_t curr_size = MemDataSize(&idec->mem_);
assert(idec->is_lossless_);
// Switch to incremental decoding if we don't have all the bytes available.
dec->incremental_ = (curr_size < idec->chunk_size_);
if (!VP8LDecodeImage(dec)) {
return ErrorStatusLossless(idec, dec->status_);
}
assert(dec->status_ == VP8_STATUS_OK || dec->status_ == VP8_STATUS_SUSPENDED);
return (dec->status_ == VP8_STATUS_SUSPENDED) ? dec->status_
: FinishDecoding(idec);
}
// Main decoding loop
static VP8StatusCode IDecode(WebPIDecoder* idec) {
VP8StatusCode status = VP8_STATUS_SUSPENDED;
if (idec->state_ == STATE_WEBP_HEADER) {
status = DecodeWebPHeaders(idec);
} else {
if (idec->dec_ == NULL) {
return VP8_STATUS_SUSPENDED; // can't continue if we have no decoder.
}
}
if (idec->state_ == STATE_VP8_HEADER) {
status = DecodeVP8FrameHeader(idec);
}
if (idec->state_ == STATE_VP8_PARTS0) {
status = DecodePartition0(idec);
}
if (idec->state_ == STATE_VP8_DATA) {
const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
if (dec == NULL) {
return VP8_STATUS_SUSPENDED; // can't continue if we have no decoder.
}
status = DecodeRemaining(idec);
}
if (idec->state_ == STATE_VP8L_HEADER) {
status = DecodeVP8LHeader(idec);
}
if (idec->state_ == STATE_VP8L_DATA) {
status = DecodeVP8LData(idec);
}
return status;
}
//------------------------------------------------------------------------------
// Internal constructor
static WebPIDecoder* NewDecoder(WebPDecBuffer* const output_buffer,
const WebPBitstreamFeatures* const features) {
WebPIDecoder* idec = (WebPIDecoder*)WebPSafeCalloc(1ULL, sizeof(*idec));
if (idec == NULL) {
return NULL;
}
idec->state_ = STATE_WEBP_HEADER;
idec->chunk_size_ = 0;
idec->last_mb_y_ = -1;
InitMemBuffer(&idec->mem_);
WebPInitDecBuffer(&idec->output_);
VP8InitIo(&idec->io_);
WebPResetDecParams(&idec->params_);
if (output_buffer == NULL || WebPAvoidSlowMemory(output_buffer, features)) {
idec->params_.output = &idec->output_;
idec->final_output_ = output_buffer;
if (output_buffer != NULL) {
idec->params_.output->colorspace = output_buffer->colorspace;
}
} else {
idec->params_.output = output_buffer;
idec->final_output_ = NULL;
}
WebPInitCustomIo(&idec->params_, &idec->io_); // Plug the I/O functions.
return idec;
}
//------------------------------------------------------------------------------
// Public functions
WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
return NewDecoder(output_buffer, NULL);
}
WebPIDecoder* WebPIDecode(const uint8_t* data, size_t data_size,
WebPDecoderConfig* config) {
WebPIDecoder* idec;
WebPBitstreamFeatures tmp_features;
WebPBitstreamFeatures* const features =
(config == NULL) ? &tmp_features : &config->input;
memset(&tmp_features, 0, sizeof(tmp_features));
// Parse the bitstream's features, if requested:
if (data != NULL && data_size > 0) {
if (WebPGetFeatures(data, data_size, features) != VP8_STATUS_OK) {
return NULL;
}
}
// Create an instance of the incremental decoder
idec = (config != NULL) ? NewDecoder(&config->output, features)
: NewDecoder(NULL, features);
if (idec == NULL) {
return NULL;
}
// Finish initialization
if (config != NULL) {
idec->params_.options = &config->options;
}
return idec;
}
void WebPIDelete(WebPIDecoder* idec) {
if (idec == NULL) return;
if (idec->dec_ != NULL) {
if (!idec->is_lossless_) {
if (idec->state_ == STATE_VP8_DATA) {
// Synchronize the thread, clean-up and check for errors.
VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
}
VP8Delete((VP8Decoder*)idec->dec_);
} else {
VP8LDelete((VP8LDecoder*)idec->dec_);
}
}
ClearMemBuffer(&idec->mem_);
WebPFreeDecBuffer(&idec->output_);
WebPSafeFree(idec);
}
//------------------------------------------------------------------------------
// Wrapper toward WebPINewDecoder
WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE csp, uint8_t* output_buffer,
size_t output_buffer_size, int output_stride) {
const int is_external_memory = (output_buffer != NULL) ? 1 : 0;
WebPIDecoder* idec;
if (csp >= MODE_YUV) return NULL;
if (is_external_memory == 0) { // Overwrite parameters to sane values.
output_buffer_size = 0;
output_stride = 0;
} else { // A buffer was passed. Validate the other params.
if (output_stride == 0 || output_buffer_size == 0) {
return NULL; // invalid parameter.
}
}
idec = WebPINewDecoder(NULL);
if (idec == NULL) return NULL;
idec->output_.colorspace = csp;
idec->output_.is_external_memory = is_external_memory;
idec->output_.u.RGBA.rgba = output_buffer;
idec->output_.u.RGBA.stride = output_stride;
idec->output_.u.RGBA.size = output_buffer_size;
return idec;
}
WebPIDecoder* WebPINewYUVA(uint8_t* luma, size_t luma_size, int luma_stride,
uint8_t* u, size_t u_size, int u_stride,
uint8_t* v, size_t v_size, int v_stride,
uint8_t* a, size_t a_size, int a_stride) {
const int is_external_memory = (luma != NULL) ? 1 : 0;
WebPIDecoder* idec;
WEBP_CSP_MODE colorspace;
if (is_external_memory == 0) { // Overwrite parameters to sane values.
luma_size = u_size = v_size = a_size = 0;
luma_stride = u_stride = v_stride = a_stride = 0;
u = v = a = NULL;
colorspace = MODE_YUVA;
} else { // A luma buffer was passed. Validate the other parameters.
if (u == NULL || v == NULL) return NULL;
if (luma_size == 0 || u_size == 0 || v_size == 0) return NULL;
if (luma_stride == 0 || u_stride == 0 || v_stride == 0) return NULL;
if (a != NULL) {
if (a_size == 0 || a_stride == 0) return NULL;
}
colorspace = (a == NULL) ? MODE_YUV : MODE_YUVA;
}
idec = WebPINewDecoder(NULL);
if (idec == NULL) return NULL;
idec->output_.colorspace = colorspace;
idec->output_.is_external_memory = is_external_memory;
idec->output_.u.YUVA.y = luma;
idec->output_.u.YUVA.y_stride = luma_stride;
idec->output_.u.YUVA.y_size = luma_size;
idec->output_.u.YUVA.u = u;
idec->output_.u.YUVA.u_stride = u_stride;
idec->output_.u.YUVA.u_size = u_size;
idec->output_.u.YUVA.v = v;
idec->output_.u.YUVA.v_stride = v_stride;
idec->output_.u.YUVA.v_size = v_size;
idec->output_.u.YUVA.a = a;
idec->output_.u.YUVA.a_stride = a_stride;
idec->output_.u.YUVA.a_size = a_size;
return idec;
}
WebPIDecoder* WebPINewYUV(uint8_t* luma, size_t luma_size, int luma_stride,
uint8_t* u, size_t u_size, int u_stride,
uint8_t* v, size_t v_size, int v_stride) {
return WebPINewYUVA(luma, luma_size, luma_stride,
u, u_size, u_stride,
v, v_size, v_stride,
NULL, 0, 0);
}
//------------------------------------------------------------------------------
static VP8StatusCode IDecCheckStatus(const WebPIDecoder* const idec) {
assert(idec);
if (idec->state_ == STATE_ERROR) {
return VP8_STATUS_BITSTREAM_ERROR;
}
if (idec->state_ == STATE_DONE) {
return VP8_STATUS_OK;
}
return VP8_STATUS_SUSPENDED;
}
VP8StatusCode WebPIAppend(WebPIDecoder* idec,
const uint8_t* data, size_t data_size) {
VP8StatusCode status;
if (idec == NULL || data == NULL) {
return VP8_STATUS_INVALID_PARAM;
}
status = IDecCheckStatus(idec);
if (status != VP8_STATUS_SUSPENDED) {
return status;
}
// Check mixed calls between RemapMemBuffer and AppendToMemBuffer.
if (!CheckMemBufferMode(&idec->mem_, MEM_MODE_APPEND)) {
return VP8_STATUS_INVALID_PARAM;
}
// Append data to memory buffer
if (!AppendToMemBuffer(idec, data, data_size)) {
return VP8_STATUS_OUT_OF_MEMORY;
}
return IDecode(idec);
}
VP8StatusCode WebPIUpdate(WebPIDecoder* idec,
const uint8_t* data, size_t data_size) {
VP8StatusCode status;
if (idec == NULL || data == NULL) {
return VP8_STATUS_INVALID_PARAM;
}
status = IDecCheckStatus(idec);
if (status != VP8_STATUS_SUSPENDED) {
return status;
}
// Check mixed calls between RemapMemBuffer and AppendToMemBuffer.
if (!CheckMemBufferMode(&idec->mem_, MEM_MODE_MAP)) {
return VP8_STATUS_INVALID_PARAM;
}
// Make the memory buffer point to the new buffer
if (!RemapMemBuffer(idec, data, data_size)) {
return VP8_STATUS_INVALID_PARAM;
}
return IDecode(idec);
}
//------------------------------------------------------------------------------
static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) {
if (idec == NULL || idec->dec_ == NULL) {
return NULL;
}
if (idec->state_ <= STATE_VP8_PARTS0) {
return NULL;
}
if (idec->final_output_ != NULL) {
return NULL; // not yet slow-copied
}
return idec->params_.output;
}
const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* idec,
int* left, int* top,
int* width, int* height) {
const WebPDecBuffer* const src = GetOutputBuffer(idec);
if (left != NULL) *left = 0;
if (top != NULL) *top = 0;
if (src != NULL) {
if (width != NULL) *width = src->width;
if (height != NULL) *height = idec->params_.last_y;
} else {
if (width != NULL) *width = 0;
if (height != NULL) *height = 0;
}
return src;
}
uint8_t* WebPIDecGetRGB(const WebPIDecoder* idec, int* last_y,
int* width, int* height, int* stride) {
const WebPDecBuffer* const src = GetOutputBuffer(idec);
if (src == NULL) return NULL;
if (src->colorspace >= MODE_YUV) {
return NULL;
}
if (last_y != NULL) *last_y = idec->params_.last_y;
if (width != NULL) *width = src->width;
if (height != NULL) *height = src->height;
if (stride != NULL) *stride = src->u.RGBA.stride;
return src->u.RGBA.rgba;
}
uint8_t* WebPIDecGetYUVA(const WebPIDecoder* idec, int* last_y,
uint8_t** u, uint8_t** v, uint8_t** a,
int* width, int* height,
int* stride, int* uv_stride, int* a_stride) {
const WebPDecBuffer* const src = GetOutputBuffer(idec);
if (src == NULL) return NULL;
if (src->colorspace < MODE_YUV) {
return NULL;
}
if (last_y != NULL) *last_y = idec->params_.last_y;
if (u != NULL) *u = src->u.YUVA.u;
if (v != NULL) *v = src->u.YUVA.v;
if (a != NULL) *a = src->u.YUVA.a;
if (width != NULL) *width = src->width;
if (height != NULL) *height = src->height;
if (stride != NULL) *stride = src->u.YUVA.y_stride;
if (uv_stride != NULL) *uv_stride = src->u.YUVA.u_stride;
if (a_stride != NULL) *a_stride = src->u.YUVA.a_stride;
return src->u.YUVA.y;
}
int WebPISetIOHooks(WebPIDecoder* const idec,
VP8IoPutHook put,
VP8IoSetupHook setup,
VP8IoTeardownHook teardown,
void* user_data) {
if (idec == NULL || idec->state_ > STATE_WEBP_HEADER) {
return 0;
}
idec->io_.put = put;
idec->io_.setup = setup;
idec->io_.teardown = teardown;
idec->io_.opaque = user_data;
return 1;
}

644
vendor/github.com/sizeofint/webpanimation/dec_io_dec.c generated vendored Normal file
View File

@ -0,0 +1,644 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// functions for sample output.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "dec_vp8i_dec.h"
#include "dec_webpi_dec.h"
#include "dsp_dsp.h"
#include "dsp_yuv.h"
#include "utils_utils.h"
//------------------------------------------------------------------------------
// Main YUV<->RGB conversion functions
static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) {
WebPDecBuffer* output = p->output;
const WebPYUVABuffer* const buf = &output->u.YUVA;
uint8_t* const y_dst = buf->y + (size_t)io->mb_y * buf->y_stride;
uint8_t* const u_dst = buf->u + (size_t)(io->mb_y >> 1) * buf->u_stride;
uint8_t* const v_dst = buf->v + (size_t)(io->mb_y >> 1) * buf->v_stride;
const int mb_w = io->mb_w;
const int mb_h = io->mb_h;
const int uv_w = (mb_w + 1) / 2;
const int uv_h = (mb_h + 1) / 2;
WebPCopyPlane(io->y, io->y_stride, y_dst, buf->y_stride, mb_w, mb_h);
WebPCopyPlane(io->u, io->uv_stride, u_dst, buf->u_stride, uv_w, uv_h);
WebPCopyPlane(io->v, io->uv_stride, v_dst, buf->v_stride, uv_w, uv_h);
return io->mb_h;
}
// Point-sampling U/V sampler.
static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) {
WebPDecBuffer* const output = p->output;
WebPRGBABuffer* const buf = &output->u.RGBA;
uint8_t* const dst = buf->rgba + (size_t)io->mb_y * buf->stride;
WebPSamplerProcessPlane(io->y, io->y_stride,
io->u, io->v, io->uv_stride,
dst, buf->stride, io->mb_w, io->mb_h,
WebPSamplers[output->colorspace]);
return io->mb_h;
}
//------------------------------------------------------------------------------
// Fancy upsampling
#ifdef FANCY_UPSAMPLING
static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
int num_lines_out = io->mb_h; // a priori guess
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
uint8_t* dst = buf->rgba + (size_t)io->mb_y * buf->stride;
WebPUpsampleLinePairFunc upsample = WebPUpsamplers[p->output->colorspace];
const uint8_t* cur_y = io->y;
const uint8_t* cur_u = io->u;
const uint8_t* cur_v = io->v;
const uint8_t* top_u = p->tmp_u;
const uint8_t* top_v = p->tmp_v;
int y = io->mb_y;
const int y_end = io->mb_y + io->mb_h;
const int mb_w = io->mb_w;
const int uv_w = (mb_w + 1) / 2;
if (y == 0) {
// First line is special cased. We mirror the u/v samples at boundary.
upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
} else {
// We can finish the left-over line from previous call.
upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
dst - buf->stride, dst, mb_w);
++num_lines_out;
}
// Loop over each output pairs of row.
for (; y + 2 < y_end; y += 2) {
top_u = cur_u;
top_v = cur_v;
cur_u += io->uv_stride;
cur_v += io->uv_stride;
dst += 2 * buf->stride;
cur_y += 2 * io->y_stride;
upsample(cur_y - io->y_stride, cur_y,
top_u, top_v, cur_u, cur_v,
dst - buf->stride, dst, mb_w);
}
// move to last row
cur_y += io->y_stride;
if (io->crop_top + y_end < io->crop_bottom) {
// Save the unfinished samples for next call (as we're not done yet).
memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y));
memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u));
memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v));
// The fancy upsampler leaves a row unfinished behind
// (except for the very last row)
num_lines_out--;
} else {
// Process the very last row of even-sized picture
if (!(y_end & 1)) {
upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
dst + buf->stride, NULL, mb_w);
}
}
return num_lines_out;
}
#endif /* FANCY_UPSAMPLING */
//------------------------------------------------------------------------------
static void FillAlphaPlane(uint8_t* dst, int w, int h, int stride) {
int j;
for (j = 0; j < h; ++j) {
memset(dst, 0xff, w * sizeof(*dst));
dst += stride;
}
}
static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
int expected_num_lines_out) {
const uint8_t* alpha = io->a;
const WebPYUVABuffer* const buf = &p->output->u.YUVA;
const int mb_w = io->mb_w;
const int mb_h = io->mb_h;
uint8_t* dst = buf->a + (size_t)io->mb_y * buf->a_stride;
int j;
(void)expected_num_lines_out;
assert(expected_num_lines_out == mb_h);
if (alpha != NULL) {
for (j = 0; j < mb_h; ++j) {
memcpy(dst, alpha, mb_w * sizeof(*dst));
alpha += io->width;
dst += buf->a_stride;
}
} else if (buf->a != NULL) {
// the user requested alpha, but there is none, set it to opaque.
FillAlphaPlane(dst, mb_w, mb_h, buf->a_stride);
}
return 0;
}
static int GetAlphaSourceRow(const VP8Io* const io,
const uint8_t** alpha, int* const num_rows) {
int start_y = io->mb_y;
*num_rows = io->mb_h;
// Compensate for the 1-line delay of the fancy upscaler.
// This is similar to EmitFancyRGB().
if (io->fancy_upsampling) {
if (start_y == 0) {
// We don't process the last row yet. It'll be done during the next call.
--*num_rows;
} else {
--start_y;
// Fortunately, *alpha data is persistent, so we can go back
// one row and finish alpha blending, now that the fancy upscaler
// completed the YUV->RGB interpolation.
*alpha -= io->width;
}
if (io->crop_top + io->mb_y + io->mb_h == io->crop_bottom) {
// If it's the very last call, we process all the remaining rows!
*num_rows = io->crop_bottom - io->crop_top - start_y;
}
}
return start_y;
}
static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
int expected_num_lines_out) {
const uint8_t* alpha = io->a;
if (alpha != NULL) {
const int mb_w = io->mb_w;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const int alpha_first =
(colorspace == MODE_ARGB || colorspace == MODE_Argb);
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
int num_rows;
const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
uint8_t* const dst = base_rgba + (alpha_first ? 0 : 3);
const int has_alpha = WebPDispatchAlpha(alpha, io->width, mb_w,
num_rows, dst, buf->stride);
(void)expected_num_lines_out;
assert(expected_num_lines_out == num_rows);
// has_alpha is true if there's non-trivial alpha to premultiply with.
if (has_alpha && WebPIsPremultipliedMode(colorspace)) {
WebPApplyAlphaMultiply(base_rgba, alpha_first,
mb_w, num_rows, buf->stride);
}
}
return 0;
}
static int EmitAlphaRGBA4444(const VP8Io* const io, WebPDecParams* const p,
int expected_num_lines_out) {
const uint8_t* alpha = io->a;
if (alpha != NULL) {
const int mb_w = io->mb_w;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
int num_rows;
const size_t start_y = GetAlphaSourceRow(io, &alpha, &num_rows);
uint8_t* const base_rgba = buf->rgba + start_y * buf->stride;
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
#else
uint8_t* alpha_dst = base_rgba + 1;
#endif
uint32_t alpha_mask = 0x0f;
int i, j;
for (j = 0; j < num_rows; ++j) {
for (i = 0; i < mb_w; ++i) {
// Fill in the alpha value (converted to 4 bits).
const uint32_t alpha_value = alpha[i] >> 4;
alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
alpha_mask &= alpha_value;
}
alpha += io->width;
alpha_dst += buf->stride;
}
(void)expected_num_lines_out;
assert(expected_num_lines_out == num_rows);
if (alpha_mask != 0x0f && WebPIsPremultipliedMode(colorspace)) {
WebPApplyAlphaMultiply4444(base_rgba, mb_w, num_rows, buf->stride);
}
}
return 0;
}
//------------------------------------------------------------------------------
// YUV rescaling (no final RGB conversion needed)
#if !defined(WEBP_REDUCE_SIZE)
static int Rescale(const uint8_t* src, int src_stride,
int new_lines, WebPRescaler* const wrk) {
int num_lines_out = 0;
while (new_lines > 0) { // import new contributions of source rows.
const int lines_in = WebPRescalerImport(wrk, new_lines, src, src_stride);
src += lines_in * src_stride;
new_lines -= lines_in;
num_lines_out += WebPRescalerExport(wrk); // emit output row(s)
}
return num_lines_out;
}
static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
const int mb_h = io->mb_h;
const int uv_mb_h = (mb_h + 1) >> 1;
WebPRescaler* const scaler = p->scaler_y;
int num_lines_out = 0;
if (WebPIsAlphaMode(p->output->colorspace) && io->a != NULL) {
// Before rescaling, we premultiply the luma directly into the io->y
// internal buffer. This is OK since these samples are not used for
// intra-prediction (the top samples are saved in cache_y_/u_/v_).
// But we need to cast the const away, though.
WebPMultRows((uint8_t*)io->y, io->y_stride,
io->a, io->width, io->mb_w, mb_h, 0);
}
num_lines_out = Rescale(io->y, io->y_stride, mb_h, scaler);
Rescale(io->u, io->uv_stride, uv_mb_h, p->scaler_u);
Rescale(io->v, io->uv_stride, uv_mb_h, p->scaler_v);
return num_lines_out;
}
static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
int expected_num_lines_out) {
const WebPYUVABuffer* const buf = &p->output->u.YUVA;
uint8_t* const dst_a = buf->a + (size_t)p->last_y * buf->a_stride;
if (io->a != NULL) {
uint8_t* const dst_y = buf->y + (size_t)p->last_y * buf->y_stride;
const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
assert(expected_num_lines_out == num_lines_out);
if (num_lines_out > 0) { // unmultiply the Y
WebPMultRows(dst_y, buf->y_stride, dst_a, buf->a_stride,
p->scaler_a->dst_width, num_lines_out, 1);
}
} else if (buf->a != NULL) {
// the user requested alpha, but there is none, set it to opaque.
assert(p->last_y + expected_num_lines_out <= io->scaled_height);
FillAlphaPlane(dst_a, io->scaled_width, expected_num_lines_out,
buf->a_stride);
}
return 0;
}
static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
const WebPYUVABuffer* const buf = &p->output->u.YUVA;
const int out_width = io->scaled_width;
const int out_height = io->scaled_height;
const int uv_out_width = (out_width + 1) >> 1;
const int uv_out_height = (out_height + 1) >> 1;
const int uv_in_width = (io->mb_w + 1) >> 1;
const int uv_in_height = (io->mb_h + 1) >> 1;
const size_t work_size = 2 * out_width; // scratch memory for luma rescaler
const size_t uv_work_size = 2 * uv_out_width; // and for each u/v ones
size_t tmp_size, rescaler_size;
rescaler_t* work;
WebPRescaler* scalers;
const int num_rescalers = has_alpha ? 4 : 3;
tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
if (has_alpha) {
tmp_size += work_size * sizeof(*work);
}
rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
p->memory = WebPSafeMalloc(1ULL, tmp_size + rescaler_size);
if (p->memory == NULL) {
return 0; // memory error
}
work = (rescaler_t*)p->memory;
scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + tmp_size);
p->scaler_y = &scalers[0];
p->scaler_u = &scalers[1];
p->scaler_v = &scalers[2];
p->scaler_a = has_alpha ? &scalers[3] : NULL;
WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
buf->y, out_width, out_height, buf->y_stride, 1,
work);
WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
work + work_size);
WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
work + work_size + uv_work_size);
p->emit = EmitRescaledYUV;
if (has_alpha) {
WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
buf->a, out_width, out_height, buf->a_stride, 1,
work + work_size + 2 * uv_work_size);
p->emit_alpha = EmitRescaledAlphaYUV;
WebPInitAlphaProcessing();
}
return 1;
}
//------------------------------------------------------------------------------
// RGBA rescaling
static int ExportRGB(WebPDecParams* const p, int y_pos) {
const WebPYUV444Converter convert =
WebPYUV444Converters[p->output->colorspace];
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
uint8_t* dst = buf->rgba + (size_t)y_pos * buf->stride;
int num_lines_out = 0;
// For RGB rescaling, because of the YUV420, current scan position
// U/V can be +1/-1 line from the Y one. Hence the double test.
while (WebPRescalerHasPendingOutput(p->scaler_y) &&
WebPRescalerHasPendingOutput(p->scaler_u)) {
assert(y_pos + num_lines_out < p->output->height);
assert(p->scaler_u->y_accum == p->scaler_v->y_accum);
WebPRescalerExportRow(p->scaler_y);
WebPRescalerExportRow(p->scaler_u);
WebPRescalerExportRow(p->scaler_v);
convert(p->scaler_y->dst, p->scaler_u->dst, p->scaler_v->dst,
dst, p->scaler_y->dst_width);
dst += buf->stride;
++num_lines_out;
}
return num_lines_out;
}
static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
const int mb_h = io->mb_h;
const int uv_mb_h = (mb_h + 1) >> 1;
int j = 0, uv_j = 0;
int num_lines_out = 0;
while (j < mb_h) {
const int y_lines_in =
WebPRescalerImport(p->scaler_y, mb_h - j,
io->y + (size_t)j * io->y_stride, io->y_stride);
j += y_lines_in;
if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
const int u_lines_in = WebPRescalerImport(
p->scaler_u, uv_mb_h - uv_j, io->u + (size_t)uv_j * io->uv_stride,
io->uv_stride);
const int v_lines_in = WebPRescalerImport(
p->scaler_v, uv_mb_h - uv_j, io->v + (size_t)uv_j * io->uv_stride,
io->uv_stride);
(void)v_lines_in; // remove a gcc warning
assert(u_lines_in == v_lines_in);
uv_j += u_lines_in;
}
num_lines_out += ExportRGB(p, p->last_y + num_lines_out);
}
return num_lines_out;
}
static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const int alpha_first =
(colorspace == MODE_ARGB || colorspace == MODE_Argb);
uint8_t* dst = base_rgba + (alpha_first ? 0 : 3);
int num_lines_out = 0;
const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
uint32_t non_opaque = 0;
const int width = p->scaler_a->dst_width;
while (WebPRescalerHasPendingOutput(p->scaler_a) &&
num_lines_out < max_lines_out) {
assert(y_pos + num_lines_out < p->output->height);
WebPRescalerExportRow(p->scaler_a);
non_opaque |= WebPDispatchAlpha(p->scaler_a->dst, 0, width, 1, dst, 0);
dst += buf->stride;
++num_lines_out;
}
if (is_premult_alpha && non_opaque) {
WebPApplyAlphaMultiply(base_rgba, alpha_first,
width, num_lines_out, buf->stride);
}
return num_lines_out;
}
static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
int max_lines_out) {
const WebPRGBABuffer* const buf = &p->output->u.RGBA;
uint8_t* const base_rgba = buf->rgba + (size_t)y_pos * buf->stride;
#if (WEBP_SWAP_16BIT_CSP == 1)
uint8_t* alpha_dst = base_rgba;
#else
uint8_t* alpha_dst = base_rgba + 1;
#endif
int num_lines_out = 0;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const int width = p->scaler_a->dst_width;
const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
uint32_t alpha_mask = 0x0f;
while (WebPRescalerHasPendingOutput(p->scaler_a) &&
num_lines_out < max_lines_out) {
int i;
assert(y_pos + num_lines_out < p->output->height);
WebPRescalerExportRow(p->scaler_a);
for (i = 0; i < width; ++i) {
// Fill in the alpha value (converted to 4 bits).
const uint32_t alpha_value = p->scaler_a->dst[i] >> 4;
alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
alpha_mask &= alpha_value;
}
alpha_dst += buf->stride;
++num_lines_out;
}
if (is_premult_alpha && alpha_mask != 0x0f) {
WebPApplyAlphaMultiply4444(base_rgba, width, num_lines_out, buf->stride);
}
return num_lines_out;
}
static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
int expected_num_out_lines) {
if (io->a != NULL) {
WebPRescaler* const scaler = p->scaler_a;
int lines_left = expected_num_out_lines;
const int y_end = p->last_y + lines_left;
while (lines_left > 0) {
const int64_t row_offset = (int64_t)scaler->src_y - io->mb_y;
WebPRescalerImport(scaler, io->mb_h + io->mb_y - scaler->src_y,
io->a + row_offset * io->width, io->width);
lines_left -= p->emit_alpha_row(p, y_end - lines_left, lines_left);
}
}
return 0;
}
static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
const int has_alpha = WebPIsAlphaMode(p->output->colorspace);
const int out_width = io->scaled_width;
const int out_height = io->scaled_height;
const int uv_in_width = (io->mb_w + 1) >> 1;
const int uv_in_height = (io->mb_h + 1) >> 1;
const size_t work_size = 2 * out_width; // scratch memory for one rescaler
rescaler_t* work; // rescalers work area
uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion
size_t tmp_size1, tmp_size2, total_size, rescaler_size;
WebPRescaler* scalers;
const int num_rescalers = has_alpha ? 4 : 3;
tmp_size1 = 3 * work_size;
tmp_size2 = 3 * out_width;
if (has_alpha) {
tmp_size1 += work_size;
tmp_size2 += out_width;
}
total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
p->memory = WebPSafeMalloc(1ULL, total_size + rescaler_size);
if (p->memory == NULL) {
return 0; // memory error
}
work = (rescaler_t*)p->memory;
tmp = (uint8_t*)(work + tmp_size1);
scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + total_size);
p->scaler_y = &scalers[0];
p->scaler_u = &scalers[1];
p->scaler_v = &scalers[2];
p->scaler_a = has_alpha ? &scalers[3] : NULL;
WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
tmp + 0 * out_width, out_width, out_height, 0, 1,
work + 0 * work_size);
WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
tmp + 1 * out_width, out_width, out_height, 0, 1,
work + 1 * work_size);
WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
tmp + 2 * out_width, out_width, out_height, 0, 1,
work + 2 * work_size);
p->emit = EmitRescaledRGB;
WebPInitYUV444Converters();
if (has_alpha) {
WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
tmp + 3 * out_width, out_width, out_height, 0, 1,
work + 3 * work_size);
p->emit_alpha = EmitRescaledAlphaRGB;
if (p->output->colorspace == MODE_RGBA_4444 ||
p->output->colorspace == MODE_rgbA_4444) {
p->emit_alpha_row = ExportAlphaRGBA4444;
} else {
p->emit_alpha_row = ExportAlpha;
}
WebPInitAlphaProcessing();
}
return 1;
}
#endif // WEBP_REDUCE_SIZE
//------------------------------------------------------------------------------
// Default custom functions
static int CustomSetup(VP8Io* io) {
WebPDecParams* const p = (WebPDecParams*)io->opaque;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
const int is_rgb = WebPIsRGBMode(colorspace);
const int is_alpha = WebPIsAlphaMode(colorspace);
p->memory = NULL;
p->emit = NULL;
p->emit_alpha = NULL;
p->emit_alpha_row = NULL;
if (!WebPIoInitFromOptions(p->options, io, is_alpha ? MODE_YUV : MODE_YUVA)) {
return 0;
}
if (is_alpha && WebPIsPremultipliedMode(colorspace)) {
WebPInitUpsamplers();
}
if (io->use_scaling) {
#if !defined(WEBP_REDUCE_SIZE)
const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p);
if (!ok) {
return 0; // memory error
}
#else
return 0; // rescaling support not compiled
#endif
} else {
if (is_rgb) {
WebPInitSamplers();
p->emit = EmitSampledRGB; // default
if (io->fancy_upsampling) {
#ifdef FANCY_UPSAMPLING
const int uv_width = (io->mb_w + 1) >> 1;
p->memory = WebPSafeMalloc(1ULL, (size_t)(io->mb_w + 2 * uv_width));
if (p->memory == NULL) {
return 0; // memory error.
}
p->tmp_y = (uint8_t*)p->memory;
p->tmp_u = p->tmp_y + io->mb_w;
p->tmp_v = p->tmp_u + uv_width;
p->emit = EmitFancyRGB;
WebPInitUpsamplers();
#endif
}
} else {
p->emit = EmitYUV;
}
if (is_alpha) { // need transparency output
p->emit_alpha =
(colorspace == MODE_RGBA_4444 || colorspace == MODE_rgbA_4444) ?
EmitAlphaRGBA4444
: is_rgb ? EmitAlphaRGB
: EmitAlphaYUV;
if (is_rgb) {
WebPInitAlphaProcessing();
}
}
}
return 1;
}
//------------------------------------------------------------------------------
static int CustomPut(const VP8Io* io) {
WebPDecParams* const p = (WebPDecParams*)io->opaque;
const int mb_w = io->mb_w;
const int mb_h = io->mb_h;
int num_lines_out;
assert(!(io->mb_y & 1));
if (mb_w <= 0 || mb_h <= 0) {
return 0;
}
num_lines_out = p->emit(io, p);
if (p->emit_alpha != NULL) {
p->emit_alpha(io, p, num_lines_out);
}
p->last_y += num_lines_out;
return 1;
}
//------------------------------------------------------------------------------
static void CustomTeardown(const VP8Io* io) {
WebPDecParams* const p = (WebPDecParams*)io->opaque;
WebPSafeFree(p->memory);
p->memory = NULL;
}
//------------------------------------------------------------------------------
// Main entry point
void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
io->put = CustomPut;
io->setup = CustomSetup;
io->teardown = CustomTeardown;
io->opaque = params;
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,115 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantizer initialization
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dec_vp8i_dec.h"
static WEBP_INLINE int clip(int v, int M) {
return v < 0 ? 0 : v > M ? M : v;
}
// Paragraph 14.1
static const uint8_t kDcTable[128] = {
4, 5, 6, 7, 8, 9, 10, 10,
11, 12, 13, 14, 15, 16, 17, 17,
18, 19, 20, 20, 21, 21, 22, 22,
23, 23, 24, 25, 25, 26, 27, 28,
29, 30, 31, 32, 33, 34, 35, 36,
37, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 46, 47, 48, 49, 50,
51, 52, 53, 54, 55, 56, 57, 58,
59, 60, 61, 62, 63, 64, 65, 66,
67, 68, 69, 70, 71, 72, 73, 74,
75, 76, 76, 77, 78, 79, 80, 81,
82, 83, 84, 85, 86, 87, 88, 89,
91, 93, 95, 96, 98, 100, 101, 102,
104, 106, 108, 110, 112, 114, 116, 118,
122, 124, 126, 128, 130, 132, 134, 136,
138, 140, 143, 145, 148, 151, 154, 157
};
static const uint16_t kAcTable[128] = {
4, 5, 6, 7, 8, 9, 10, 11,
12, 13, 14, 15, 16, 17, 18, 19,
20, 21, 22, 23, 24, 25, 26, 27,
28, 29, 30, 31, 32, 33, 34, 35,
36, 37, 38, 39, 40, 41, 42, 43,
44, 45, 46, 47, 48, 49, 50, 51,
52, 53, 54, 55, 56, 57, 58, 60,
62, 64, 66, 68, 70, 72, 74, 76,
78, 80, 82, 84, 86, 88, 90, 92,
94, 96, 98, 100, 102, 104, 106, 108,
110, 112, 114, 116, 119, 122, 125, 128,
131, 134, 137, 140, 143, 146, 149, 152,
155, 158, 161, 164, 167, 170, 173, 177,
181, 185, 189, 193, 197, 201, 205, 209,
213, 217, 221, 225, 229, 234, 239, 245,
249, 254, 259, 264, 269, 274, 279, 284
};
//------------------------------------------------------------------------------
// Paragraph 9.6
void VP8ParseQuant(VP8Decoder* const dec) {
VP8BitReader* const br = &dec->br_;
const int base_q0 = VP8GetValue(br, 7, "global-header");
const int dqy1_dc = VP8Get(br, "global-header") ?
VP8GetSignedValue(br, 4, "global-header") : 0;
const int dqy2_dc = VP8Get(br, "global-header") ?
VP8GetSignedValue(br, 4, "global-header") : 0;
const int dqy2_ac = VP8Get(br, "global-header") ?
VP8GetSignedValue(br, 4, "global-header") : 0;
const int dquv_dc = VP8Get(br, "global-header") ?
VP8GetSignedValue(br, 4, "global-header") : 0;
const int dquv_ac = VP8Get(br, "global-header") ?
VP8GetSignedValue(br, 4, "global-header") : 0;
const VP8SegmentHeader* const hdr = &dec->segment_hdr_;
int i;
for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
int q;
if (hdr->use_segment_) {
q = hdr->quantizer_[i];
if (!hdr->absolute_delta_) {
q += base_q0;
}
} else {
if (i > 0) {
dec->dqm_[i] = dec->dqm_[0];
continue;
} else {
q = base_q0;
}
}
{
VP8QuantMatrix* const m = &dec->dqm_[i];
m->y1_mat_[0] = kDcTable[clip(q + dqy1_dc, 127)];
m->y1_mat_[1] = kAcTable[clip(q + 0, 127)];
m->y2_mat_[0] = kDcTable[clip(q + dqy2_dc, 127)] * 2;
// For all x in [0..284], x*155/100 is bitwise equal to (x*101581) >> 16.
// The smallest precision for that is '(x*6349) >> 12' but 16 is a good
// word size.
m->y2_mat_[1] = (kAcTable[clip(q + dqy2_ac, 127)] * 101581) >> 16;
if (m->y2_mat_[1] < 8) m->y2_mat_[1] = 8;
m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
m->uv_quant_ = q + dquv_ac; // for dithering strength evaluation
}
}
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,537 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Coding trees and probas
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dec_vp8i_dec.h"
#include "utils_bit_reader_inl_utils.h"
#if !defined(USE_GENERIC_TREE)
#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
#define USE_GENERIC_TREE 1 // ALTERNATE_CODE
#else
#define USE_GENERIC_TREE 0
#endif
#endif // USE_GENERIC_TREE
#if (USE_GENERIC_TREE == 1)
static const int8_t kYModesIntra4[18] = {
-B_DC_PRED, 1,
-B_TM_PRED, 2,
-B_VE_PRED, 3,
4, 6,
-B_HE_PRED, 5,
-B_RD_PRED, -B_VR_PRED,
-B_LD_PRED, 7,
-B_VL_PRED, 8,
-B_HD_PRED, -B_HU_PRED
};
#endif
//------------------------------------------------------------------------------
// Default probabilities
// Paragraph 13.5
static const uint8_t
CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
{ { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
{ 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
{ 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
},
{ { 1, 98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
{ 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
{ 78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 },
},
{ { 1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
{ 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
{ 77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 },
},
{ { 1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
{ 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
{ 37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
},
{ { 1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
{ 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
{ 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
},
{ { 1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
{ 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
{ 80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
},
{ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 246, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
}
},
{ { { 198, 35, 237, 223, 193, 187, 162, 160, 145, 155, 62 },
{ 131, 45, 198, 221, 172, 176, 220, 157, 252, 221, 1 },
{ 68, 47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
},
{ { 1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
{ 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
{ 81, 99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
},
{ { 1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
{ 99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
{ 23, 91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
},
{ { 1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
{ 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
{ 44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
},
{ { 1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
{ 94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
{ 22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
},
{ { 1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
{ 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
{ 35, 77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
},
{ { 1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
{ 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
{ 45, 99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
},
{ { 1, 1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
{ 203, 1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 137, 1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
}
},
{ { { 253, 9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
{ 175, 13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
{ 73, 17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
},
{ { 1, 95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
{ 239, 90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
{ 155, 77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
},
{ { 1, 24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
{ 201, 51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
{ 69, 46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
},
{ { 1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
{ 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
},
{ { 1, 16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 190, 36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
{ 149, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ { 1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ { 1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 213, 62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
{ 55, 93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
},
{ { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
}
},
{ { { 202, 24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
{ 126, 38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
{ 61, 46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
},
{ { 1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
{ 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
{ 39, 77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
},
{ { 1, 52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
{ 124, 74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
{ 24, 71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
},
{ { 1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
{ 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
{ 28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
},
{ { 1, 81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
{ 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
{ 20, 95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
},
{ { 1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
{ 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
{ 47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
},
{ { 1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
{ 141, 84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
{ 42, 80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
},
{ { 1, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 244, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
{ 238, 1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
}
}
};
// Paragraph 11.5
static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
{ { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
{ 152, 179, 64, 126, 170, 118, 46, 70, 95 },
{ 175, 69, 143, 80, 85, 82, 72, 155, 103 },
{ 56, 58, 10, 171, 218, 189, 17, 13, 152 },
{ 114, 26, 17, 163, 44, 195, 21, 10, 173 },
{ 121, 24, 80, 195, 26, 62, 44, 64, 85 },
{ 144, 71, 10, 38, 171, 213, 144, 34, 26 },
{ 170, 46, 55, 19, 136, 160, 33, 206, 71 },
{ 63, 20, 8, 114, 114, 208, 12, 9, 226 },
{ 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
{ { 134, 183, 89, 137, 98, 101, 106, 165, 148 },
{ 72, 187, 100, 130, 157, 111, 32, 75, 80 },
{ 66, 102, 167, 99, 74, 62, 40, 234, 128 },
{ 41, 53, 9, 178, 241, 141, 26, 8, 107 },
{ 74, 43, 26, 146, 73, 166, 49, 23, 157 },
{ 65, 38, 105, 160, 51, 52, 31, 115, 128 },
{ 104, 79, 12, 27, 217, 255, 87, 17, 7 },
{ 87, 68, 71, 44, 114, 51, 15, 186, 23 },
{ 47, 41, 14, 110, 182, 183, 21, 17, 194 },
{ 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
{ { 88, 88, 147, 150, 42, 46, 45, 196, 205 },
{ 43, 97, 183, 117, 85, 38, 35, 179, 61 },
{ 39, 53, 200, 87, 26, 21, 43, 232, 171 },
{ 56, 34, 51, 104, 114, 102, 29, 93, 77 },
{ 39, 28, 85, 171, 58, 165, 90, 98, 64 },
{ 34, 22, 116, 206, 23, 34, 43, 166, 73 },
{ 107, 54, 32, 26, 51, 1, 81, 43, 31 },
{ 68, 25, 106, 22, 64, 171, 36, 225, 114 },
{ 34, 19, 21, 102, 132, 188, 16, 76, 124 },
{ 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
{ { 193, 101, 35, 159, 215, 111, 89, 46, 111 },
{ 60, 148, 31, 172, 219, 228, 21, 18, 111 },
{ 112, 113, 77, 85, 179, 255, 38, 120, 114 },
{ 40, 42, 1, 196, 245, 209, 10, 25, 109 },
{ 88, 43, 29, 140, 166, 213, 37, 43, 154 },
{ 61, 63, 30, 155, 67, 45, 68, 1, 209 },
{ 100, 80, 8, 43, 154, 1, 51, 26, 71 },
{ 142, 78, 78, 16, 255, 128, 34, 197, 171 },
{ 41, 40, 5, 102, 211, 183, 4, 1, 221 },
{ 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
{ { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
{ 67, 87, 58, 169, 82, 115, 26, 59, 179 },
{ 63, 59, 90, 180, 59, 166, 93, 73, 154 },
{ 40, 40, 21, 116, 143, 209, 34, 39, 175 },
{ 47, 15, 16, 183, 34, 223, 49, 45, 183 },
{ 46, 17, 33, 183, 6, 98, 15, 32, 183 },
{ 57, 46, 22, 24, 128, 1, 54, 17, 37 },
{ 65, 32, 73, 115, 28, 128, 23, 128, 205 },
{ 40, 3, 9, 115, 51, 192, 18, 6, 223 },
{ 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
{ { 104, 55, 44, 218, 9, 54, 53, 130, 226 },
{ 64, 90, 70, 205, 40, 41, 23, 26, 57 },
{ 54, 57, 112, 184, 5, 41, 38, 166, 213 },
{ 30, 34, 26, 133, 152, 116, 10, 32, 134 },
{ 39, 19, 53, 221, 26, 114, 32, 73, 255 },
{ 31, 9, 65, 234, 2, 15, 1, 118, 73 },
{ 75, 32, 12, 51, 192, 255, 160, 43, 51 },
{ 88, 31, 35, 67, 102, 85, 55, 186, 85 },
{ 56, 21, 23, 111, 59, 205, 45, 37, 192 },
{ 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
{ { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
{ 95, 84, 53, 89, 128, 100, 113, 101, 45 },
{ 75, 79, 123, 47, 51, 128, 81, 171, 1 },
{ 57, 17, 5, 71, 102, 57, 53, 41, 49 },
{ 38, 33, 13, 121, 57, 73, 26, 1, 85 },
{ 41, 10, 67, 138, 77, 110, 90, 47, 114 },
{ 115, 21, 2, 10, 102, 255, 166, 23, 6 },
{ 101, 29, 16, 10, 85, 128, 101, 196, 26 },
{ 57, 18, 10, 102, 102, 213, 34, 20, 43 },
{ 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
{ { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
{ 69, 60, 71, 38, 73, 119, 28, 222, 37 },
{ 68, 45, 128, 34, 1, 47, 11, 245, 171 },
{ 62, 17, 19, 70, 146, 85, 55, 62, 70 },
{ 37, 43, 37, 154, 100, 163, 85, 160, 1 },
{ 63, 9, 92, 136, 28, 64, 32, 201, 85 },
{ 75, 15, 9, 9, 64, 255, 184, 119, 16 },
{ 86, 6, 28, 5, 64, 255, 25, 248, 1 },
{ 56, 8, 17, 132, 137, 255, 55, 116, 128 },
{ 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
{ { 164, 50, 31, 137, 154, 133, 25, 35, 218 },
{ 51, 103, 44, 131, 131, 123, 31, 6, 158 },
{ 86, 40, 64, 135, 148, 224, 45, 183, 128 },
{ 22, 26, 17, 131, 240, 154, 14, 1, 209 },
{ 45, 16, 21, 91, 64, 222, 7, 1, 197 },
{ 56, 21, 39, 155, 60, 138, 23, 102, 213 },
{ 83, 12, 13, 54, 192, 255, 68, 47, 28 },
{ 85, 26, 85, 85, 128, 128, 32, 146, 171 },
{ 18, 11, 7, 63, 144, 171, 4, 4, 246 },
{ 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
{ { 190, 80, 35, 99, 180, 80, 126, 54, 45 },
{ 85, 126, 47, 87, 176, 51, 41, 20, 32 },
{ 101, 75, 128, 139, 118, 146, 116, 128, 85 },
{ 56, 41, 15, 176, 236, 85, 37, 9, 62 },
{ 71, 30, 17, 119, 118, 255, 17, 18, 138 },
{ 101, 38, 60, 138, 55, 70, 43, 26, 142 },
{ 146, 36, 19, 30, 171, 255, 97, 27, 20 },
{ 138, 45, 61, 62, 219, 1, 81, 188, 64 },
{ 32, 41, 20, 117, 151, 142, 20, 21, 163 },
{ 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
};
void VP8ResetProba(VP8Proba* const proba) {
memset(proba->segments_, 255u, sizeof(proba->segments_));
// proba->bands_[][] is initialized later
}
static void ParseIntraMode(VP8BitReader* const br,
VP8Decoder* const dec, int mb_x) {
uint8_t* const top = dec->intra_t_ + 4 * mb_x;
uint8_t* const left = dec->intra_l_;
VP8MBData* const block = dec->mb_data_ + mb_x;
// Note: we don't save segment map (yet), as we don't expect
// to decode more than 1 keyframe.
if (dec->segment_hdr_.update_map_) {
// Hardcoded tree parsing
block->segment_ = !VP8GetBit(br, dec->proba_.segments_[0], "segments")
? VP8GetBit(br, dec->proba_.segments_[1], "segments")
: VP8GetBit(br, dec->proba_.segments_[2], "segments") + 2;
} else {
block->segment_ = 0; // default for intra
}
if (dec->use_skip_proba_) block->skip_ = VP8GetBit(br, dec->skip_p_, "skip");
block->is_i4x4_ = !VP8GetBit(br, 145, "block-size");
if (!block->is_i4x4_) {
// Hardcoded 16x16 intra-mode decision tree.
const int ymode =
VP8GetBit(br, 156, "pred-modes") ?
(VP8GetBit(br, 128, "pred-modes") ? TM_PRED : H_PRED) :
(VP8GetBit(br, 163, "pred-modes") ? V_PRED : DC_PRED);
block->imodes_[0] = ymode;
memset(top, ymode, 4 * sizeof(*top));
memset(left, ymode, 4 * sizeof(*left));
} else {
uint8_t* modes = block->imodes_;
int y;
for (y = 0; y < 4; ++y) {
int ymode = left[y];
int x;
for (x = 0; x < 4; ++x) {
const uint8_t* const prob = kBModesProba[top[x]][ymode];
#if (USE_GENERIC_TREE == 1)
// Generic tree-parsing
int i = kYModesIntra4[VP8GetBit(br, prob[0], "pred-modes")];
while (i > 0) {
i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i], "pred-modes")];
}
ymode = -i;
#else
// Hardcoded tree parsing
ymode = !VP8GetBit(br, prob[0], "pred-modes") ? B_DC_PRED :
!VP8GetBit(br, prob[1], "pred-modes") ? B_TM_PRED :
!VP8GetBit(br, prob[2], "pred-modes") ? B_VE_PRED :
!VP8GetBit(br, prob[3], "pred-modes") ?
(!VP8GetBit(br, prob[4], "pred-modes") ? B_HE_PRED :
(!VP8GetBit(br, prob[5], "pred-modes") ? B_RD_PRED
: B_VR_PRED)) :
(!VP8GetBit(br, prob[6], "pred-modes") ? B_LD_PRED :
(!VP8GetBit(br, prob[7], "pred-modes") ? B_VL_PRED :
(!VP8GetBit(br, prob[8], "pred-modes") ? B_HD_PRED
: B_HU_PRED))
);
#endif // USE_GENERIC_TREE
top[x] = ymode;
}
memcpy(modes, top, 4 * sizeof(*top));
modes += 4;
left[y] = ymode;
}
}
// Hardcoded UVMode decision tree
block->uvmode_ = !VP8GetBit(br, 142, "pred-modes-uv") ? DC_PRED
: !VP8GetBit(br, 114, "pred-modes-uv") ? V_PRED
: VP8GetBit(br, 183, "pred-modes-uv") ? TM_PRED : H_PRED;
}
int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec) {
int mb_x;
for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
ParseIntraMode(br, dec, mb_x);
}
return !dec->br_.eof_;
}
//------------------------------------------------------------------------------
// Paragraph 13
static const uint8_t
CoeffsUpdateProba[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
{ { { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 176, 246, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 223, 241, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 249, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 244, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 234, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 246, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 239, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 251, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 251, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 253, 255, 254, 255, 255, 255, 255, 255, 255 },
{ 250, 255, 254, 255, 254, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
}
},
{ { { 217, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 225, 252, 241, 253, 255, 255, 254, 255, 255, 255, 255 },
{ 234, 250, 241, 250, 253, 255, 253, 254, 255, 255, 255 }
},
{ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 223, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 238, 253, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 248, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 249, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 253, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 247, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 252, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 253, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
}
},
{ { { 186, 251, 250, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 234, 251, 244, 254, 255, 255, 255, 255, 255, 255, 255 },
{ 251, 251, 243, 253, 254, 255, 254, 255, 255, 255, 255 }
},
{ { 255, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 236, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 251, 253, 253, 254, 254, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 254, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
}
},
{ { { 248, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 250, 254, 252, 254, 255, 255, 255, 255, 255, 255, 255 },
{ 248, 254, 249, 253, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 246, 253, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 252, 254, 251, 254, 254, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 254, 252, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 248, 254, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 253, 255, 254, 254, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 245, 251, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 253, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 251, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 252, 253, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 254, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 252, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 249, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 254, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 253, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 250, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
},
{ { 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 254, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 },
{ 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255 }
}
}
};
// Paragraph 9.9
static const uint8_t kBands[16 + 1] = {
0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
0 // extra entry as sentinel
};
void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
VP8Proba* const proba = &dec->proba_;
int t, b, c, p;
for (t = 0; t < NUM_TYPES; ++t) {
for (b = 0; b < NUM_BANDS; ++b) {
for (c = 0; c < NUM_CTX; ++c) {
for (p = 0; p < NUM_PROBAS; ++p) {
const int v =
VP8GetBit(br, CoeffsUpdateProba[t][b][c][p], "global-header") ?
VP8GetValue(br, 8, "global-header") :
CoeffsProba0[t][b][c][p];
proba->bands_[t][b].probas_[c][p] = v;
}
}
}
for (b = 0; b < 16 + 1; ++b) {
proba->bands_ptr_[t][b] = &proba->bands_[t][kBands[b]];
}
}
dec->use_skip_proba_ = VP8Get(br, "global-header");
if (dec->use_skip_proba_) {
dec->skip_p_ = VP8GetValue(br, 8, "global-header");
}
}

722
vendor/github.com/sizeofint/webpanimation/dec_vp8_dec.c generated vendored Normal file
View File

@ -0,0 +1,722 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// main entry for the decoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "dec_alphai_dec.h"
#include "dec_vp8i_dec.h"
#include "dec_vp8li_dec.h"
#include "dec_webpi_dec.h"
#include "utils_bit_reader_inl_utils.h"
#include "utils_utils.h"
//------------------------------------------------------------------------------
int WebPGetDecoderVersion(void) {
return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
}
//------------------------------------------------------------------------------
// Signature and pointer-to-function for GetCoeffs() variants below.
typedef int (*GetCoeffsFunc)(VP8BitReader* const br,
const VP8BandProbas* const prob[],
int ctx, const quant_t dq, int n, int16_t* out);
static volatile GetCoeffsFunc GetCoeffs = NULL;
static void InitGetCoeffs(void);
//------------------------------------------------------------------------------
// VP8Decoder
static void SetOk(VP8Decoder* const dec) {
dec->status_ = VP8_STATUS_OK;
dec->error_msg_ = "OK";
}
int VP8InitIoInternal(VP8Io* const io, int version) {
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
return 0; // mismatch error
}
if (io != NULL) {
memset(io, 0, sizeof(*io));
}
return 1;
}
VP8Decoder* VP8New(void) {
VP8Decoder* const dec = (VP8Decoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
if (dec != NULL) {
SetOk(dec);
WebPGetWorkerInterface()->Init(&dec->worker_);
dec->ready_ = 0;
dec->num_parts_minus_one_ = 0;
InitGetCoeffs();
}
return dec;
}
VP8StatusCode VP8Status(VP8Decoder* const dec) {
if (!dec) return VP8_STATUS_INVALID_PARAM;
return dec->status_;
}
const char* VP8StatusMessage(VP8Decoder* const dec) {
if (dec == NULL) return "no object";
if (!dec->error_msg_) return "OK";
return dec->error_msg_;
}
void VP8Delete(VP8Decoder* const dec) {
if (dec != NULL) {
VP8Clear(dec);
WebPSafeFree(dec);
}
}
int VP8SetError(VP8Decoder* const dec,
VP8StatusCode error, const char* const msg) {
// The oldest error reported takes precedence over the new one.
if (dec->status_ == VP8_STATUS_OK) {
dec->status_ = error;
dec->error_msg_ = msg;
dec->ready_ = 0;
}
return 0;
}
//------------------------------------------------------------------------------
int VP8CheckSignature(const uint8_t* const data, size_t data_size) {
return (data_size >= 3 &&
data[0] == 0x9d && data[1] == 0x01 && data[2] == 0x2a);
}
int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
int* const width, int* const height) {
if (data == NULL || data_size < VP8_FRAME_HEADER_SIZE) {
return 0; // not enough data
}
// check signature
if (!VP8CheckSignature(data + 3, data_size - 3)) {
return 0; // Wrong signature.
} else {
const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16);
const int key_frame = !(bits & 1);
const int w = ((data[7] << 8) | data[6]) & 0x3fff;
const int h = ((data[9] << 8) | data[8]) & 0x3fff;
if (!key_frame) { // Not a keyframe.
return 0;
}
if (((bits >> 1) & 7) > 3) {
return 0; // unknown profile
}
if (!((bits >> 4) & 1)) {
return 0; // first frame is invisible!
}
if (((bits >> 5)) >= chunk_size) { // partition_length
return 0; // inconsistent size information.
}
if (w == 0 || h == 0) {
return 0; // We don't support both width and height to be zero.
}
if (width) {
*width = w;
}
if (height) {
*height = h;
}
return 1;
}
}
//------------------------------------------------------------------------------
// Header parsing
static void ResetSegmentHeader(VP8SegmentHeader* const hdr) {
assert(hdr != NULL);
hdr->use_segment_ = 0;
hdr->update_map_ = 0;
hdr->absolute_delta_ = 1;
memset(hdr->quantizer_, 0, sizeof(hdr->quantizer_));
memset(hdr->filter_strength_, 0, sizeof(hdr->filter_strength_));
}
// Paragraph 9.3
static int ParseSegmentHeader(VP8BitReader* br,
VP8SegmentHeader* hdr, VP8Proba* proba) {
assert(br != NULL);
assert(hdr != NULL);
hdr->use_segment_ = VP8Get(br, "global-header");
if (hdr->use_segment_) {
hdr->update_map_ = VP8Get(br, "global-header");
if (VP8Get(br, "global-header")) { // update data
int s;
hdr->absolute_delta_ = VP8Get(br, "global-header");
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
hdr->quantizer_[s] = VP8Get(br, "global-header") ?
VP8GetSignedValue(br, 7, "global-header") : 0;
}
for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
hdr->filter_strength_[s] = VP8Get(br, "global-header") ?
VP8GetSignedValue(br, 6, "global-header") : 0;
}
}
if (hdr->update_map_) {
int s;
for (s = 0; s < MB_FEATURE_TREE_PROBS; ++s) {
proba->segments_[s] = VP8Get(br, "global-header") ?
VP8GetValue(br, 8, "global-header") : 255u;
}
}
} else {
hdr->update_map_ = 0;
}
return !br->eof_;
}
// Paragraph 9.5
// This function returns VP8_STATUS_SUSPENDED if we don't have all the
// necessary data in 'buf'.
// This case is not necessarily an error (for incremental decoding).
// Still, no bitreader is ever initialized to make it possible to read
// unavailable memory.
// If we don't even have the partitions' sizes, than VP8_STATUS_NOT_ENOUGH_DATA
// is returned, and this is an unrecoverable error.
// If the partitions were positioned ok, VP8_STATUS_OK is returned.
static VP8StatusCode ParsePartitions(VP8Decoder* const dec,
const uint8_t* buf, size_t size) {
VP8BitReader* const br = &dec->br_;
const uint8_t* sz = buf;
const uint8_t* buf_end = buf + size;
const uint8_t* part_start;
size_t size_left = size;
size_t last_part;
size_t p;
dec->num_parts_minus_one_ = (1 << VP8GetValue(br, 2, "global-header")) - 1;
last_part = dec->num_parts_minus_one_;
if (size < 3 * last_part) {
// we can't even read the sizes with sz[]! That's a failure.
return VP8_STATUS_NOT_ENOUGH_DATA;
}
part_start = buf + last_part * 3;
size_left -= last_part * 3;
for (p = 0; p < last_part; ++p) {
size_t psize = sz[0] | (sz[1] << 8) | (sz[2] << 16);
if (psize > size_left) psize = size_left;
VP8InitBitReader(dec->parts_ + p, part_start, psize);
part_start += psize;
size_left -= psize;
sz += 3;
}
VP8InitBitReader(dec->parts_ + last_part, part_start, size_left);
return (part_start < buf_end) ? VP8_STATUS_OK :
VP8_STATUS_SUSPENDED; // Init is ok, but there's not enough data
}
// Paragraph 9.4
static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) {
VP8FilterHeader* const hdr = &dec->filter_hdr_;
hdr->simple_ = VP8Get(br, "global-header");
hdr->level_ = VP8GetValue(br, 6, "global-header");
hdr->sharpness_ = VP8GetValue(br, 3, "global-header");
hdr->use_lf_delta_ = VP8Get(br, "global-header");
if (hdr->use_lf_delta_) {
if (VP8Get(br, "global-header")) { // update lf-delta?
int i;
for (i = 0; i < NUM_REF_LF_DELTAS; ++i) {
if (VP8Get(br, "global-header")) {
hdr->ref_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header");
}
}
for (i = 0; i < NUM_MODE_LF_DELTAS; ++i) {
if (VP8Get(br, "global-header")) {
hdr->mode_lf_delta_[i] = VP8GetSignedValue(br, 6, "global-header");
}
}
}
}
dec->filter_type_ = (hdr->level_ == 0) ? 0 : hdr->simple_ ? 1 : 2;
return !br->eof_;
}
// Topmost call
int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
const uint8_t* buf;
size_t buf_size;
VP8FrameHeader* frm_hdr;
VP8PictureHeader* pic_hdr;
VP8BitReader* br;
VP8StatusCode status;
if (dec == NULL) {
return 0;
}
SetOk(dec);
if (io == NULL) {
return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
"null VP8Io passed to VP8GetHeaders()");
}
buf = io->data;
buf_size = io->data_size;
if (buf_size < 4) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"Truncated header.");
}
// Paragraph 9.1
{
const uint32_t bits = buf[0] | (buf[1] << 8) | (buf[2] << 16);
frm_hdr = &dec->frm_hdr_;
frm_hdr->key_frame_ = !(bits & 1);
frm_hdr->profile_ = (bits >> 1) & 7;
frm_hdr->show_ = (bits >> 4) & 1;
frm_hdr->partition_length_ = (bits >> 5);
if (frm_hdr->profile_ > 3) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"Incorrect keyframe parameters.");
}
if (!frm_hdr->show_) {
return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
"Frame not displayable.");
}
buf += 3;
buf_size -= 3;
}
pic_hdr = &dec->pic_hdr_;
if (frm_hdr->key_frame_) {
// Paragraph 9.2
if (buf_size < 7) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"cannot parse picture header");
}
if (!VP8CheckSignature(buf, buf_size)) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"Bad code word");
}
pic_hdr->width_ = ((buf[4] << 8) | buf[3]) & 0x3fff;
pic_hdr->xscale_ = buf[4] >> 6; // ratio: 1, 5/4 5/3 or 2
pic_hdr->height_ = ((buf[6] << 8) | buf[5]) & 0x3fff;
pic_hdr->yscale_ = buf[6] >> 6;
buf += 7;
buf_size -= 7;
dec->mb_w_ = (pic_hdr->width_ + 15) >> 4;
dec->mb_h_ = (pic_hdr->height_ + 15) >> 4;
// Setup default output area (can be later modified during io->setup())
io->width = pic_hdr->width_;
io->height = pic_hdr->height_;
// IMPORTANT! use some sane dimensions in crop_* and scaled_* fields.
// So they can be used interchangeably without always testing for
// 'use_cropping'.
io->use_cropping = 0;
io->crop_top = 0;
io->crop_left = 0;
io->crop_right = io->width;
io->crop_bottom = io->height;
io->use_scaling = 0;
io->scaled_width = io->width;
io->scaled_height = io->height;
io->mb_w = io->width; // sanity check
io->mb_h = io->height; // ditto
VP8ResetProba(&dec->proba_);
ResetSegmentHeader(&dec->segment_hdr_);
}
// Check if we have all the partition #0 available, and initialize dec->br_
// to read this partition (and this partition only).
if (frm_hdr->partition_length_ > buf_size) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"bad partition length");
}
br = &dec->br_;
VP8InitBitReader(br, buf, frm_hdr->partition_length_);
buf += frm_hdr->partition_length_;
buf_size -= frm_hdr->partition_length_;
if (frm_hdr->key_frame_) {
pic_hdr->colorspace_ = VP8Get(br, "global-header");
pic_hdr->clamp_type_ = VP8Get(br, "global-header");
}
if (!ParseSegmentHeader(br, &dec->segment_hdr_, &dec->proba_)) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"cannot parse segment header");
}
// Filter specs
if (!ParseFilterHeader(br, dec)) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"cannot parse filter header");
}
status = ParsePartitions(dec, buf, buf_size);
if (status != VP8_STATUS_OK) {
return VP8SetError(dec, status, "cannot parse partitions");
}
// quantizer change
VP8ParseQuant(dec);
// Frame buffer marking
if (!frm_hdr->key_frame_) {
return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
"Not a key frame.");
}
VP8Get(br, "global-header"); // ignore the value of update_proba_
VP8ParseProba(br, dec);
// sanitized state
dec->ready_ = 1;
return 1;
}
//------------------------------------------------------------------------------
// Residual decoding (Paragraph 13.2 / 13.3)
static const uint8_t kCat3[] = { 173, 148, 140, 0 };
static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
static const uint8_t kCat6[] =
{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
int v;
if (!VP8GetBit(br, p[3], "coeffs")) {
if (!VP8GetBit(br, p[4], "coeffs")) {
v = 2;
} else {
v = 3 + VP8GetBit(br, p[5], "coeffs");
}
} else {
if (!VP8GetBit(br, p[6], "coeffs")) {
if (!VP8GetBit(br, p[7], "coeffs")) {
v = 5 + VP8GetBit(br, 159, "coeffs");
} else {
v = 7 + 2 * VP8GetBit(br, 165, "coeffs");
v += VP8GetBit(br, 145, "coeffs");
}
} else {
const uint8_t* tab;
const int bit1 = VP8GetBit(br, p[8], "coeffs");
const int bit0 = VP8GetBit(br, p[9 + bit1], "coeffs");
const int cat = 2 * bit1 + bit0;
v = 0;
for (tab = kCat3456[cat]; *tab; ++tab) {
v += v + VP8GetBit(br, *tab, "coeffs");
}
v += 3 + (8 << cat);
}
}
return v;
}
// Returns the position of the last non-zero coeff plus one
static int GetCoeffsFast(VP8BitReader* const br,
const VP8BandProbas* const prob[],
int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) {
if (!VP8GetBit(br, p[0], "coeffs")) {
return n; // previous coeff was last non-zero coeff
}
while (!VP8GetBit(br, p[1], "coeffs")) { // sequence of zero coeffs
p = prob[++n]->probas_[0];
if (n == 16) return 16;
}
{ // non zero coeff
const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
int v;
if (!VP8GetBit(br, p[2], "coeffs")) {
v = 1;
p = p_ctx[1];
} else {
v = GetLargeValue(br, p);
p = p_ctx[2];
}
out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0];
}
}
return 16;
}
// This version of GetCoeffs() uses VP8GetBitAlt() which is an alternate version
// of VP8GetBitAlt() targeting specific platforms.
static int GetCoeffsAlt(VP8BitReader* const br,
const VP8BandProbas* const prob[],
int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) {
if (!VP8GetBitAlt(br, p[0], "coeffs")) {
return n; // previous coeff was last non-zero coeff
}
while (!VP8GetBitAlt(br, p[1], "coeffs")) { // sequence of zero coeffs
p = prob[++n]->probas_[0];
if (n == 16) return 16;
}
{ // non zero coeff
const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
int v;
if (!VP8GetBitAlt(br, p[2], "coeffs")) {
v = 1;
p = p_ctx[1];
} else {
v = GetLargeValue(br, p);
p = p_ctx[2];
}
out[kZigzag[n]] = VP8GetSigned(br, v, "coeffs") * dq[n > 0];
}
}
return 16;
}
WEBP_DSP_INIT_FUNC(InitGetCoeffs) {
if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
GetCoeffs = GetCoeffsAlt;
} else {
GetCoeffs = GetCoeffsFast;
}
}
static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
nz_coeffs <<= 2;
nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
return nz_coeffs;
}
static int ParseResiduals(VP8Decoder* const dec,
VP8MB* const mb, VP8BitReader* const token_br) {
const VP8BandProbas* (* const bands)[16 + 1] = dec->proba_.bands_ptr_;
const VP8BandProbas* const * ac_proba;
VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
const VP8QuantMatrix* const q = &dec->dqm_[block->segment_];
int16_t* dst = block->coeffs_;
VP8MB* const left_mb = dec->mb_info_ - 1;
uint8_t tnz, lnz;
uint32_t non_zero_y = 0;
uint32_t non_zero_uv = 0;
int x, y, ch;
uint32_t out_t_nz, out_l_nz;
int first;
memset(dst, 0, 384 * sizeof(*dst));
if (!block->is_i4x4_) { // parse DC
int16_t dc[16] = { 0 };
const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
if (nz > 1) { // more than just the DC -> perform the full transform
VP8TransformWHT(dc, dst);
} else { // only DC is non-zero -> inlined simplified transform
int i;
const int dc0 = (dc[0] + 3) >> 3;
for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
}
first = 1;
ac_proba = bands[0];
} else {
first = 0;
ac_proba = bands[3];
}
tnz = mb->nz_ & 0x0f;
lnz = left_mb->nz_ & 0x0f;
for (y = 0; y < 4; ++y) {
int l = lnz & 1;
uint32_t nz_coeffs = 0;
for (x = 0; x < 4; ++x) {
const int ctx = l + (tnz & 1);
const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
l = (nz > first);
tnz = (tnz >> 1) | (l << 7);
nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
dst += 16;
}
tnz >>= 4;
lnz = (lnz >> 1) | (l << 7);
non_zero_y = (non_zero_y << 8) | nz_coeffs;
}
out_t_nz = tnz;
out_l_nz = lnz >> 4;
for (ch = 0; ch < 4; ch += 2) {
uint32_t nz_coeffs = 0;
tnz = mb->nz_ >> (4 + ch);
lnz = left_mb->nz_ >> (4 + ch);
for (y = 0; y < 2; ++y) {
int l = lnz & 1;
for (x = 0; x < 2; ++x) {
const int ctx = l + (tnz & 1);
const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
l = (nz > 0);
tnz = (tnz >> 1) | (l << 3);
nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
dst += 16;
}
tnz >>= 2;
lnz = (lnz >> 1) | (l << 5);
}
// Note: we don't really need the per-4x4 details for U/V blocks.
non_zero_uv |= nz_coeffs << (4 * ch);
out_t_nz |= (tnz << 4) << ch;
out_l_nz |= (lnz & 0xf0) << ch;
}
mb->nz_ = out_t_nz;
left_mb->nz_ = out_l_nz;
block->non_zero_y_ = non_zero_y;
block->non_zero_uv_ = non_zero_uv;
// We look at the mode-code of each block and check if some blocks have less
// than three non-zero coeffs (code < 2). This is to avoid dithering flat and
// empty blocks.
block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
return !(non_zero_y | non_zero_uv); // will be used for further optimization
}
//------------------------------------------------------------------------------
// Main loop
int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
VP8MB* const left = dec->mb_info_ - 1;
VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
int skip = dec->use_skip_proba_ ? block->skip_ : 0;
if (!skip) {
skip = ParseResiduals(dec, mb, token_br);
} else {
left->nz_ = mb->nz_ = 0;
if (!block->is_i4x4_) {
left->nz_dc_ = mb->nz_dc_ = 0;
}
block->non_zero_y_ = 0;
block->non_zero_uv_ = 0;
block->dither_ = 0;
}
if (dec->filter_type_ > 0) { // store filter info
VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
*finfo = dec->fstrengths_[block->segment_][block->is_i4x4_];
finfo->f_inner_ |= !skip;
}
return !token_br->eof_;
}
void VP8InitScanline(VP8Decoder* const dec) {
VP8MB* const left = dec->mb_info_ - 1;
left->nz_ = 0;
left->nz_dc_ = 0;
memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
dec->mb_x_ = 0;
}
static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
// Parse bitstream for this row.
VP8BitReader* const token_br =
&dec->parts_[dec->mb_y_ & dec->num_parts_minus_one_];
if (!VP8ParseIntraModeRow(&dec->br_, dec)) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"Premature end-of-partition0 encountered.");
}
for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
if (!VP8DecodeMB(dec, token_br)) {
return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
"Premature end-of-file encountered.");
}
}
VP8InitScanline(dec); // Prepare for next scanline
// Reconstruct, filter and emit the row.
if (!VP8ProcessRow(dec, io)) {
return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
}
}
if (dec->mt_method_ > 0) {
if (!WebPGetWorkerInterface()->Sync(&dec->worker_)) return 0;
}
return 1;
}
// Main entry point
int VP8Decode(VP8Decoder* const dec, VP8Io* const io) {
int ok = 0;
if (dec == NULL) {
return 0;
}
if (io == NULL) {
return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
"NULL VP8Io parameter in VP8Decode().");
}
if (!dec->ready_) {
if (!VP8GetHeaders(dec, io)) {
return 0;
}
}
assert(dec->ready_);
// Finish setting up the decoding parameter. Will call io->setup().
ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK);
if (ok) { // good to go.
// Will allocate memory and prepare everything.
if (ok) ok = VP8InitFrame(dec, io);
// Main decoding loop
if (ok) ok = ParseFrame(dec, io);
// Exit.
ok &= VP8ExitCritical(dec, io);
}
if (!ok) {
VP8Clear(dec);
return 0;
}
dec->ready_ = 0;
return ok;
}
void VP8Clear(VP8Decoder* const dec) {
if (dec == NULL) {
return;
}
WebPGetWorkerInterface()->End(&dec->worker_);
WebPDeallocateAlphaMemory(dec);
WebPSafeFree(dec->mem_);
dec->mem_ = NULL;
dec->mem_size_ = 0;
memset(&dec->br_, 0, sizeof(dec->br_));
dec->ready_ = 0;
}
//------------------------------------------------------------------------------

185
vendor/github.com/sizeofint/webpanimation/dec_vp8_dec.h generated vendored Normal file
View File

@ -0,0 +1,185 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Low-level API for VP8 decoder
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_VP8_DEC_H_
#define WEBP_DEC_VP8_DEC_H_
#include "webp_decode.h"
#ifdef __cplusplus
extern "C" {
#endif
//------------------------------------------------------------------------------
// Lower-level API
//
// These functions provide fine-grained control of the decoding process.
// The call flow should resemble:
//
// VP8Io io;
// VP8InitIo(&io);
// io.data = data;
// io.data_size = size;
// /* customize io's functions (setup()/put()/teardown()) if needed. */
//
// VP8Decoder* dec = VP8New();
// int ok = VP8Decode(dec, &io);
// if (!ok) printf("Error: %s\n", VP8StatusMessage(dec));
// VP8Delete(dec);
// return ok;
// Input / Output
typedef struct VP8Io VP8Io;
typedef int (*VP8IoPutHook)(const VP8Io* io);
typedef int (*VP8IoSetupHook)(VP8Io* io);
typedef void (*VP8IoTeardownHook)(const VP8Io* io);
struct VP8Io {
// set by VP8GetHeaders()
int width, height; // picture dimensions, in pixels (invariable).
// These are the original, uncropped dimensions.
// The actual area passed to put() is stored
// in mb_w / mb_h fields.
// set before calling put()
int mb_y; // position of the current rows (in pixels)
int mb_w; // number of columns in the sample
int mb_h; // number of rows in the sample
const uint8_t* y, *u, *v; // rows to copy (in yuv420 format)
int y_stride; // row stride for luma
int uv_stride; // row stride for chroma
void* opaque; // user data
// called when fresh samples are available. Currently, samples are in
// YUV420 format, and can be up to width x 24 in size (depending on the
// in-loop filtering level, e.g.). Should return false in case of error
// or abort request. The actual size of the area to update is mb_w x mb_h
// in size, taking cropping into account.
VP8IoPutHook put;
// called just before starting to decode the blocks.
// Must return false in case of setup error, true otherwise. If false is
// returned, teardown() will NOT be called. But if the setup succeeded
// and true is returned, then teardown() will always be called afterward.
VP8IoSetupHook setup;
// Called just after block decoding is finished (or when an error occurred
// during put()). Is NOT called if setup() failed.
VP8IoTeardownHook teardown;
// this is a recommendation for the user-side yuv->rgb converter. This flag
// is set when calling setup() hook and can be overwritten by it. It then
// can be taken into consideration during the put() method.
int fancy_upsampling;
// Input buffer.
size_t data_size;
const uint8_t* data;
// If true, in-loop filtering will not be performed even if present in the
// bitstream. Switching off filtering may speed up decoding at the expense
// of more visible blocking. Note that output will also be non-compliant
// with the VP8 specifications.
int bypass_filtering;
// Cropping parameters.
int use_cropping;
int crop_left, crop_right, crop_top, crop_bottom;
// Scaling parameters.
int use_scaling;
int scaled_width, scaled_height;
// If non NULL, pointer to the alpha data (if present) corresponding to the
// start of the current row (That is: it is pre-offset by mb_y and takes
// cropping into account).
const uint8_t* a;
};
// Internal, version-checked, entry point
int VP8InitIoInternal(VP8Io* const, int);
// Set the custom IO function pointers and user-data. The setter for IO hooks
// should be called before initiating incremental decoding. Returns true if
// WebPIDecoder object is successfully modified, false otherwise.
int WebPISetIOHooks(WebPIDecoder* const idec,
VP8IoPutHook put,
VP8IoSetupHook setup,
VP8IoTeardownHook teardown,
void* user_data);
// Main decoding object. This is an opaque structure.
typedef struct VP8Decoder VP8Decoder;
// Create a new decoder object.
VP8Decoder* VP8New(void);
// Must be called to make sure 'io' is initialized properly.
// Returns false in case of version mismatch. Upon such failure, no other
// decoding function should be called (VP8Decode, VP8GetHeaders, ...)
static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
}
// Decode the VP8 frame header. Returns true if ok.
// Note: 'io->data' must be pointing to the start of the VP8 frame header.
int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
// Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
// Returns false in case of error.
int VP8Decode(VP8Decoder* const dec, VP8Io* const io);
// Return current status of the decoder:
VP8StatusCode VP8Status(VP8Decoder* const dec);
// return readable string corresponding to the last status.
const char* VP8StatusMessage(VP8Decoder* const dec);
// Resets the decoder in its initial state, reclaiming memory.
// Not a mandatory call between calls to VP8Decode().
void VP8Clear(VP8Decoder* const dec);
// Destroy the decoder object.
void VP8Delete(VP8Decoder* const dec);
//------------------------------------------------------------------------------
// Miscellaneous VP8/VP8L bitstream probing functions.
// Returns true if the next 3 bytes in data contain the VP8 signature.
WEBP_EXTERN int VP8CheckSignature(const uint8_t* const data, size_t data_size);
// Validates the VP8 data-header and retrieves basic header information viz
// width and height. Returns 0 in case of formatting error. *width/*height
// can be passed NULL.
WEBP_EXTERN int VP8GetInfo(
const uint8_t* data,
size_t data_size, // data available so far
size_t chunk_size, // total data size expected in the chunk
int* const width, int* const height);
// Returns true if the next byte(s) in data is a VP8L signature.
WEBP_EXTERN int VP8LCheckSignature(const uint8_t* const data, size_t size);
// Validates the VP8L data-header and retrieves basic header information viz
// width, height and alpha. Returns 0 in case of formatting error.
// width/height/has_alpha can be passed NULL.
WEBP_EXTERN int VP8LGetInfo(
const uint8_t* data, size_t data_size, // data available so far
int* const width, int* const height, int* const has_alpha);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DEC_VP8_DEC_H_

View File

@ -0,0 +1,319 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// VP8 decoder: internal header.
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DEC_VP8I_DEC_H_
#define WEBP_DEC_VP8I_DEC_H_
#include <string.h> // for memcpy()
#include "dec_common_dec.h"
#include "dec_vp8li_dec.h"
#include "utils_bit_reader_utils.h"
#include "utils_random_utils.h"
#include "utils_thread_utils.h"
#include "dsp_dsp.h"
#ifdef __cplusplus
extern "C" {
#endif
//------------------------------------------------------------------------------
// Various defines and enums
// version numbers
#define DEC_MAJ_VERSION 1
#define DEC_MIN_VERSION 2
#define DEC_REV_VERSION 0
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
// Constraints are: We need to store one 16x16 block of luma samples (y),
// and two 8x8 chroma blocks (u/v). These are better be 16-bytes aligned,
// in order to be SIMD-friendly. We also need to store the top, left and
// top-left samples (from previously decoded blocks), along with four
// extra top-right samples for luma (intra4x4 prediction only).
// One possible layout is, using 32 * (17 + 9) bytes:
//
// .+------ <- only 1 pixel high
// .|yyyyt.
// .|yyyyt.
// .|yyyyt.
// .|yyyy..
// .+--.+-- <- only 1 pixel high
// .|uu.|vv
// .|uu.|vv
//
// Every character is a 4x4 block, with legend:
// '.' = unused
// 'y' = y-samples 'u' = u-samples 'v' = u-samples
// '|' = left sample, '-' = top sample, '+' = top-left sample
// 't' = extra top-right sample for 4x4 modes
#define YUV_SIZE (BPS * 17 + BPS * 9)
#define Y_OFF (BPS * 1 + 8)
#define U_OFF (Y_OFF + BPS * 16 + BPS)
#define V_OFF (U_OFF + 16)
// minimal width under which lossy multi-threading is always disabled
#define MIN_WIDTH_FOR_THREADS 512
//------------------------------------------------------------------------------
// Headers
typedef struct {
uint8_t key_frame_;
uint8_t profile_;
uint8_t show_;
uint32_t partition_length_;
} VP8FrameHeader;
typedef struct {
uint16_t width_;
uint16_t height_;
uint8_t xscale_;
uint8_t yscale_;
uint8_t colorspace_; // 0 = YCbCr
uint8_t clamp_type_;
} VP8PictureHeader;
// segment features
typedef struct {
int use_segment_;
int update_map_; // whether to update the segment map or not
int absolute_delta_; // absolute or delta values for quantizer and filter
int8_t quantizer_[NUM_MB_SEGMENTS]; // quantization changes
int8_t filter_strength_[NUM_MB_SEGMENTS]; // filter strength for segments
} VP8SegmentHeader;
// probas associated to one of the contexts
typedef uint8_t VP8ProbaArray[NUM_PROBAS];
typedef struct { // all the probas associated to one band
VP8ProbaArray probas_[NUM_CTX];
} VP8BandProbas;
// Struct collecting all frame-persistent probabilities.
typedef struct {
uint8_t segments_[MB_FEATURE_TREE_PROBS];
// Type: 0:Intra16-AC 1:Intra16-DC 2:Chroma 3:Intra4
VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
const VP8BandProbas* bands_ptr_[NUM_TYPES][16 + 1];
} VP8Proba;
// Filter parameters
typedef struct {
int simple_; // 0=complex, 1=simple
int level_; // [0..63]
int sharpness_; // [0..7]
int use_lf_delta_;
int ref_lf_delta_[NUM_REF_LF_DELTAS];
int mode_lf_delta_[NUM_MODE_LF_DELTAS];
} VP8FilterHeader;
//------------------------------------------------------------------------------
// Informations about the macroblocks.
typedef struct { // filter specs
uint8_t f_limit_; // filter limit in [3..189], or 0 if no filtering
uint8_t f_ilevel_; // inner limit in [1..63]
uint8_t f_inner_; // do inner filtering?
uint8_t hev_thresh_; // high edge variance threshold in [0..2]
} VP8FInfo;
typedef struct { // Top/Left Contexts used for syntax-parsing
uint8_t nz_; // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
uint8_t nz_dc_; // non-zero DC coeff (1bit)
} VP8MB;
// Dequantization matrices
typedef int quant_t[2]; // [DC / AC]. Can be 'uint16_t[2]' too (~slower).
typedef struct {
quant_t y1_mat_, y2_mat_, uv_mat_;
int uv_quant_; // U/V quantizer value
int dither_; // dithering amplitude (0 = off, max=255)
} VP8QuantMatrix;
// Data needed to reconstruct a macroblock
typedef struct {
int16_t coeffs_[384]; // 384 coeffs = (16+4+4) * 4*4
uint8_t is_i4x4_; // true if intra4x4
uint8_t imodes_[16]; // one 16x16 mode (#0) or sixteen 4x4 modes
uint8_t uvmode_; // chroma prediction mode
// bit-wise info about the content of each sub-4x4 blocks (in decoding order).
// Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
// code=0 -> no coefficient
// code=1 -> only DC
// code=2 -> first three coefficients are non-zero
// code=3 -> more than three coefficients are non-zero
// This allows to call specialized transform functions.
uint32_t non_zero_y_;
uint32_t non_zero_uv_;
uint8_t dither_; // local dithering strength (deduced from non_zero_*)
uint8_t skip_;
uint8_t segment_;
} VP8MBData;
// Persistent information needed by the parallel processing
typedef struct {
int id_; // cache row to process (in [0..2])
int mb_y_; // macroblock position of the row
int filter_row_; // true if row-filtering is needed
VP8FInfo* f_info_; // filter strengths (swapped with dec->f_info_)
VP8MBData* mb_data_; // reconstruction data (swapped with dec->mb_data_)
VP8Io io_; // copy of the VP8Io to pass to put()
} VP8ThreadContext;
// Saved top samples, per macroblock. Fits into a cache-line.
typedef struct {
uint8_t y[16], u[8], v[8];
} VP8TopSamples;
//------------------------------------------------------------------------------
// VP8Decoder: the main opaque structure handed over to user
struct VP8Decoder {
VP8StatusCode status_;
int ready_; // true if ready to decode a picture with VP8Decode()
const char* error_msg_; // set when status_ is not OK.
// Main data source
VP8BitReader br_;
// headers
VP8FrameHeader frm_hdr_;
VP8PictureHeader pic_hdr_;
VP8FilterHeader filter_hdr_;
VP8SegmentHeader segment_hdr_;
// Worker
WebPWorker worker_;
int mt_method_; // multi-thread method: 0=off, 1=[parse+recon][filter]
// 2=[parse][recon+filter]
int cache_id_; // current cache row
int num_caches_; // number of cached rows of 16 pixels (1, 2 or 3)
VP8ThreadContext thread_ctx_; // Thread context
// dimension, in macroblock units.
int mb_w_, mb_h_;
// Macroblock to process/filter, depending on cropping and filter_type.
int tl_mb_x_, tl_mb_y_; // top-left MB that must be in-loop filtered
int br_mb_x_, br_mb_y_; // last bottom-right MB that must be decoded
// number of partitions minus one.
uint32_t num_parts_minus_one_;
// per-partition boolean decoders.
VP8BitReader parts_[MAX_NUM_PARTITIONS];
// Dithering strength, deduced from decoding options
int dither_; // whether to use dithering or not
VP8Random dithering_rg_; // random generator for dithering
// dequantization (one set of DC/AC dequant factor per segment)
VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
// probabilities
VP8Proba proba_;
int use_skip_proba_;
uint8_t skip_p_;
// Boundary data cache and persistent buffers.
uint8_t* intra_t_; // top intra modes values: 4 * mb_w_
uint8_t intra_l_[4]; // left intra modes values
VP8TopSamples* yuv_t_; // top y/u/v samples
VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1)
VP8FInfo* f_info_; // filter strength info
uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE)
uint8_t* cache_y_; // macroblock row for storing unfiltered samples
uint8_t* cache_u_;
uint8_t* cache_v_;
int cache_y_stride_;
int cache_uv_stride_;
// main memory chunk for the above data. Persistent.
void* mem_;
size_t mem_size_;
// Per macroblock non-persistent infos.
int mb_x_, mb_y_; // current position, in macroblock units
VP8MBData* mb_data_; // parsed reconstruction data
// Filtering side-info
int filter_type_; // 0=off, 1=simple, 2=complex
VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2]; // precalculated per-segment/type
// Alpha
struct ALPHDecoder* alph_dec_; // alpha-plane decoder object
const uint8_t* alpha_data_; // compressed alpha data (if present)
size_t alpha_data_size_;
int is_alpha_decoded_; // true if alpha_data_ is decoded in alpha_plane_
uint8_t* alpha_plane_mem_; // memory allocated for alpha_plane_
uint8_t* alpha_plane_; // output. Persistent, contains the whole data.
const uint8_t* alpha_prev_line_; // last decoded alpha row (or NULL)
int alpha_dithering_; // derived from decoding options (0=off, 100=full)
};
//------------------------------------------------------------------------------
// internal functions. Not public.
// in vp8.c
int VP8SetError(VP8Decoder* const dec,
VP8StatusCode error, const char* const msg);
// in tree.c
void VP8ResetProba(VP8Proba* const proba);
void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec);
// parses one row of intra mode data in partition 0, returns !eof
int VP8ParseIntraModeRow(VP8BitReader* const br, VP8Decoder* const dec);
// in quant.c
void VP8ParseQuant(VP8Decoder* const dec);
// in frame.c
int VP8InitFrame(VP8Decoder* const dec, VP8Io* const io);
// Call io->setup() and finish setting up scan parameters.
// After this call returns, one must always call VP8ExitCritical() with the
// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
// if ok, otherwise sets and returns the error status on *dec.
VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
// Must always be called in pair with VP8EnterCritical().
// Returns false in case of error.
int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
// Return the multi-threading method to use (0=off), depending
// on options and bitstream size. Only for lossy decoding.
int VP8GetThreadMethod(const WebPDecoderOptions* const options,
const WebPHeaderStructure* const headers,
int width, int height);
// Initialize dithering post-process if needed.
void VP8InitDithering(const WebPDecoderOptions* const options,
VP8Decoder* const dec);
// Process the last decoded row (filtering + output).
int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
// To be called at the start of a new scanline, to initialize predictors.
void VP8InitScanline(VP8Decoder* const dec);
// Decode one macroblock. Returns false if there is not enough data.
int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br);
// in alpha.c
const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
const VP8Io* const io,
int row, int num_rows);
//------------------------------------------------------------------------------
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DEC_VP8I_DEC_H_

1738
vendor/github.com/sizeofint/webpanimation/dec_vp8l_dec.c generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,135 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Lossless decoder: internal header.
//
// Author: Skal (pascal.massimino@gmail.com)
// Vikas Arora(vikaas.arora@gmail.com)
#ifndef WEBP_DEC_VP8LI_DEC_H_
#define WEBP_DEC_VP8LI_DEC_H_
#include <string.h> // for memcpy()
#include "dec_webpi_dec.h"
#include "utils_bit_reader_utils.h"
#include "utils_color_cache_utils.h"
#include "utils_huffman_utils.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef enum {
READ_DATA = 0,
READ_HDR = 1,
READ_DIM = 2
} VP8LDecodeState;
typedef struct VP8LTransform VP8LTransform;
struct VP8LTransform {
VP8LImageTransformType type_; // transform type.
int bits_; // subsampling bits defining transform window.
int xsize_; // transform window X index.
int ysize_; // transform window Y index.
uint32_t* data_; // transform data.
};
typedef struct {
int color_cache_size_;
VP8LColorCache color_cache_;
VP8LColorCache saved_color_cache_; // for incremental
int huffman_mask_;
int huffman_subsample_bits_;
int huffman_xsize_;
uint32_t* huffman_image_;
int num_htree_groups_;
HTreeGroup* htree_groups_;
HuffmanCode* huffman_tables_;
} VP8LMetadata;
typedef struct VP8LDecoder VP8LDecoder;
struct VP8LDecoder {
VP8StatusCode status_;
VP8LDecodeState state_;
VP8Io* io_;
const WebPDecBuffer* output_; // shortcut to io->opaque->output
uint32_t* pixels_; // Internal data: either uint8_t* for alpha
// or uint32_t* for BGRA.
uint32_t* argb_cache_; // Scratch buffer for temporary BGRA storage.
VP8LBitReader br_;
int incremental_; // if true, incremental decoding is expected
VP8LBitReader saved_br_; // note: could be local variables too
int saved_last_pixel_;
int width_;
int height_;
int last_row_; // last input row decoded so far.
int last_pixel_; // last pixel decoded so far. However, it may
// not be transformed, scaled and
// color-converted yet.
int last_out_row_; // last row output so far.
VP8LMetadata hdr_;
int next_transform_;
VP8LTransform transforms_[NUM_TRANSFORMS];
// or'd bitset storing the transforms types.
uint32_t transforms_seen_;
uint8_t* rescaler_memory; // Working memory for rescaling work.
WebPRescaler* rescaler; // Common rescaler for all channels.
};
//------------------------------------------------------------------------------
// internal functions. Not public.
struct ALPHDecoder; // Defined in dec/alphai.h.
// in vp8l.c
// Decodes image header for alpha data stored using lossless compression.
// Returns false in case of error.
int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
const uint8_t* const data, size_t data_size);
// Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
// already decoded in previous call(s), it will resume decoding from where it
// was paused.
// Returns false in case of bitstream error.
int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
int last_row);
// Allocates and initialize a new lossless decoder instance.
VP8LDecoder* VP8LNew(void);
// Decodes the image header. Returns false in case of error.
int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io);
// Decodes an image. It's required to decode the lossless header before calling
// this function. Returns false in case of error, with updated dec->status_.
int VP8LDecodeImage(VP8LDecoder* const dec);
// Resets the decoder in its initial state, reclaiming memory.
// Preserves the dec->status_ value.
void VP8LClear(VP8LDecoder* const dec);
// Clears and deallocate a lossless decoder instance.
void VP8LDelete(VP8LDecoder* const dec);
//------------------------------------------------------------------------------
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DEC_VP8LI_DEC_H_

View File

@ -0,0 +1,845 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Main decoding functions for WEBP images.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include "dec_vp8i_dec.h"
#include "dec_vp8li_dec.h"
#include "dec_webpi_dec.h"
#include "utils_utils.h"
#include "webp_mux_types.h"
//------------------------------------------------------------------------------
// RIFF layout is:
// Offset tag
// 0...3 "RIFF" 4-byte tag
// 4...7 size of image data (including metadata) starting at offset 8
// 8...11 "WEBP" our form-type signature
// The RIFF container (12 bytes) is followed by appropriate chunks:
// 12..15 "VP8 ": 4-bytes tags, signaling the use of VP8 video format
// 16..19 size of the raw VP8 image data, starting at offset 20
// 20.... the VP8 bytes
// Or,
// 12..15 "VP8L": 4-bytes tags, signaling the use of VP8L lossless format
// 16..19 size of the raw VP8L image data, starting at offset 20
// 20.... the VP8L bytes
// Or,
// 12..15 "VP8X": 4-bytes tags, describing the extended-VP8 chunk.
// 16..19 size of the VP8X chunk starting at offset 20.
// 20..23 VP8X flags bit-map corresponding to the chunk-types present.
// 24..26 Width of the Canvas Image.
// 27..29 Height of the Canvas Image.
// There can be extra chunks after the "VP8X" chunk (ICCP, ANMF, VP8, VP8L,
// XMP, EXIF ...)
// All sizes are in little-endian order.
// Note: chunk data size must be padded to multiple of 2 when written.
// Validates the RIFF container (if detected) and skips over it.
// If a RIFF container is detected, returns:
// VP8_STATUS_BITSTREAM_ERROR for invalid header,
// VP8_STATUS_NOT_ENOUGH_DATA for truncated data if have_all_data is true,
// and VP8_STATUS_OK otherwise.
// In case there are not enough bytes (partial RIFF container), return 0 for
// *riff_size. Else return the RIFF size extracted from the header.
static VP8StatusCode ParseRIFF(const uint8_t** const data,
size_t* const data_size, int have_all_data,
size_t* const riff_size) {
assert(data != NULL);
assert(data_size != NULL);
assert(riff_size != NULL);
*riff_size = 0; // Default: no RIFF present.
if (*data_size >= RIFF_HEADER_SIZE && !memcmp(*data, "RIFF", TAG_SIZE)) {
if (memcmp(*data + 8, "WEBP", TAG_SIZE)) {
return VP8_STATUS_BITSTREAM_ERROR; // Wrong image file signature.
} else {
const uint32_t size = GetLE32(*data + TAG_SIZE);
// Check that we have at least one chunk (i.e "WEBP" + "VP8?nnnn").
if (size < TAG_SIZE + CHUNK_HEADER_SIZE) {
return VP8_STATUS_BITSTREAM_ERROR;
}
if (size > MAX_CHUNK_PAYLOAD) {
return VP8_STATUS_BITSTREAM_ERROR;
}
if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
return VP8_STATUS_NOT_ENOUGH_DATA; // Truncated bitstream.
}
// We have a RIFF container. Skip it.
*riff_size = size;
*data += RIFF_HEADER_SIZE;
*data_size -= RIFF_HEADER_SIZE;
}
}
return VP8_STATUS_OK;
}
// Validates the VP8X header and skips over it.
// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header,
// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
// VP8_STATUS_OK otherwise.
// If a VP8X chunk is found, found_vp8x is set to true and *width_ptr,
// *height_ptr and *flags_ptr are set to the corresponding values extracted
// from the VP8X chunk.
static VP8StatusCode ParseVP8X(const uint8_t** const data,
size_t* const data_size,
int* const found_vp8x,
int* const width_ptr, int* const height_ptr,
uint32_t* const flags_ptr) {
const uint32_t vp8x_size = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE;
assert(data != NULL);
assert(data_size != NULL);
assert(found_vp8x != NULL);
*found_vp8x = 0;
if (*data_size < CHUNK_HEADER_SIZE) {
return VP8_STATUS_NOT_ENOUGH_DATA; // Insufficient data.
}
if (!memcmp(*data, "VP8X", TAG_SIZE)) {
int width, height;
uint32_t flags;
const uint32_t chunk_size = GetLE32(*data + TAG_SIZE);
if (chunk_size != VP8X_CHUNK_SIZE) {
return VP8_STATUS_BITSTREAM_ERROR; // Wrong chunk size.
}
// Verify if enough data is available to validate the VP8X chunk.
if (*data_size < vp8x_size) {
return VP8_STATUS_NOT_ENOUGH_DATA; // Insufficient data.
}
flags = GetLE32(*data + 8);
width = 1 + GetLE24(*data + 12);
height = 1 + GetLE24(*data + 15);
if (width * (uint64_t)height >= MAX_IMAGE_AREA) {
return VP8_STATUS_BITSTREAM_ERROR; // image is too large
}
if (flags_ptr != NULL) *flags_ptr = flags;
if (width_ptr != NULL) *width_ptr = width;
if (height_ptr != NULL) *height_ptr = height;
// Skip over VP8X header bytes.
*data += vp8x_size;
*data_size -= vp8x_size;
*found_vp8x = 1;
}
return VP8_STATUS_OK;
}
// Skips to the next VP8/VP8L chunk header in the data given the size of the
// RIFF chunk 'riff_size'.
// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered,
// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
// VP8_STATUS_OK otherwise.
// If an alpha chunk is found, *alpha_data and *alpha_size are set
// appropriately.
static VP8StatusCode ParseOptionalChunks(const uint8_t** const data,
size_t* const data_size,
size_t const riff_size,
const uint8_t** const alpha_data,
size_t* const alpha_size) {
const uint8_t* buf;
size_t buf_size;
uint32_t total_size = TAG_SIZE + // "WEBP".
CHUNK_HEADER_SIZE + // "VP8Xnnnn".
VP8X_CHUNK_SIZE; // data.
assert(data != NULL);
assert(data_size != NULL);
buf = *data;
buf_size = *data_size;
assert(alpha_data != NULL);
assert(alpha_size != NULL);
*alpha_data = NULL;
*alpha_size = 0;
while (1) {
uint32_t chunk_size;
uint32_t disk_chunk_size; // chunk_size with padding
*data = buf;
*data_size = buf_size;
if (buf_size < CHUNK_HEADER_SIZE) { // Insufficient data.
return VP8_STATUS_NOT_ENOUGH_DATA;
}
chunk_size = GetLE32(buf + TAG_SIZE);
if (chunk_size > MAX_CHUNK_PAYLOAD) {
return VP8_STATUS_BITSTREAM_ERROR; // Not a valid chunk size.
}
// For odd-sized chunk-payload, there's one byte padding at the end.
disk_chunk_size = (CHUNK_HEADER_SIZE + chunk_size + 1) & ~1;
total_size += disk_chunk_size;
// Check that total bytes skipped so far does not exceed riff_size.
if (riff_size > 0 && (total_size > riff_size)) {
return VP8_STATUS_BITSTREAM_ERROR; // Not a valid chunk size.
}
// Start of a (possibly incomplete) VP8/VP8L chunk implies that we have
// parsed all the optional chunks.
// Note: This check must occur before the check 'buf_size < disk_chunk_size'
// below to allow incomplete VP8/VP8L chunks.
if (!memcmp(buf, "VP8 ", TAG_SIZE) ||
!memcmp(buf, "VP8L", TAG_SIZE)) {
return VP8_STATUS_OK;
}
if (buf_size < disk_chunk_size) { // Insufficient data.
return VP8_STATUS_NOT_ENOUGH_DATA;
}
if (!memcmp(buf, "ALPH", TAG_SIZE)) { // A valid ALPH header.
*alpha_data = buf + CHUNK_HEADER_SIZE;
*alpha_size = chunk_size;
}
// We have a full and valid chunk; skip it.
buf += disk_chunk_size;
buf_size -= disk_chunk_size;
}
}
// Validates the VP8/VP8L Header ("VP8 nnnn" or "VP8L nnnn") and skips over it.
// Returns VP8_STATUS_BITSTREAM_ERROR for invalid (chunk larger than
// riff_size) VP8/VP8L header,
// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and
// VP8_STATUS_OK otherwise.
// If a VP8/VP8L chunk is found, *chunk_size is set to the total number of bytes
// extracted from the VP8/VP8L chunk header.
// The flag '*is_lossless' is set to 1 in case of VP8L chunk / raw VP8L data.
static VP8StatusCode ParseVP8Header(const uint8_t** const data_ptr,
size_t* const data_size, int have_all_data,
size_t riff_size, size_t* const chunk_size,
int* const is_lossless) {
const uint8_t* const data = *data_ptr;
const int is_vp8 = !memcmp(data, "VP8 ", TAG_SIZE);
const int is_vp8l = !memcmp(data, "VP8L", TAG_SIZE);
const uint32_t minimal_size =
TAG_SIZE + CHUNK_HEADER_SIZE; // "WEBP" + "VP8 nnnn" OR
// "WEBP" + "VP8Lnnnn"
assert(data != NULL);
assert(data_size != NULL);
assert(chunk_size != NULL);
assert(is_lossless != NULL);
if (*data_size < CHUNK_HEADER_SIZE) {
return VP8_STATUS_NOT_ENOUGH_DATA; // Insufficient data.
}
if (is_vp8 || is_vp8l) {
// Bitstream contains VP8/VP8L header.
const uint32_t size = GetLE32(data + TAG_SIZE);
if ((riff_size >= minimal_size) && (size > riff_size - minimal_size)) {
return VP8_STATUS_BITSTREAM_ERROR; // Inconsistent size information.
}
if (have_all_data && (size > *data_size - CHUNK_HEADER_SIZE)) {
return VP8_STATUS_NOT_ENOUGH_DATA; // Truncated bitstream.
}
// Skip over CHUNK_HEADER_SIZE bytes from VP8/VP8L Header.
*chunk_size = size;
*data_ptr += CHUNK_HEADER_SIZE;
*data_size -= CHUNK_HEADER_SIZE;
*is_lossless = is_vp8l;
} else {
// Raw VP8/VP8L bitstream (no header).
*is_lossless = VP8LCheckSignature(data, *data_size);
*chunk_size = *data_size;
}
return VP8_STATUS_OK;
}
//------------------------------------------------------------------------------
// Fetch '*width', '*height', '*has_alpha' and fill out 'headers' based on
// 'data'. All the output parameters may be NULL. If 'headers' is NULL only the
// minimal amount will be read to fetch the remaining parameters.
// If 'headers' is non-NULL this function will attempt to locate both alpha
// data (with or without a VP8X chunk) and the bitstream chunk (VP8/VP8L).
// Note: The following chunk sequences (before the raw VP8/VP8L data) are
// considered valid by this function:
// RIFF + VP8(L)
// RIFF + VP8X + (optional chunks) + VP8(L)
// ALPH + VP8 <-- Not a valid WebP format: only allowed for internal purpose.
// VP8(L) <-- Not a valid WebP format: only allowed for internal purpose.
static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
size_t data_size,
int* const width,
int* const height,
int* const has_alpha,
int* const has_animation,
int* const format,
WebPHeaderStructure* const headers) {
int canvas_width = 0;
int canvas_height = 0;
int image_width = 0;
int image_height = 0;
int found_riff = 0;
int found_vp8x = 0;
int animation_present = 0;
const int have_all_data = (headers != NULL) ? headers->have_all_data : 0;
VP8StatusCode status;
WebPHeaderStructure hdrs;
if (data == NULL || data_size < RIFF_HEADER_SIZE) {
return VP8_STATUS_NOT_ENOUGH_DATA;
}
memset(&hdrs, 0, sizeof(hdrs));
hdrs.data = data;
hdrs.data_size = data_size;
// Skip over RIFF header.
status = ParseRIFF(&data, &data_size, have_all_data, &hdrs.riff_size);
if (status != VP8_STATUS_OK) {
return status; // Wrong RIFF header / insufficient data.
}
found_riff = (hdrs.riff_size > 0);
// Skip over VP8X.
{
uint32_t flags = 0;
status = ParseVP8X(&data, &data_size, &found_vp8x,
&canvas_width, &canvas_height, &flags);
if (status != VP8_STATUS_OK) {
return status; // Wrong VP8X / insufficient data.
}
animation_present = !!(flags & ANIMATION_FLAG);
if (!found_riff && found_vp8x) {
// Note: This restriction may be removed in the future, if it becomes
// necessary to send VP8X chunk to the decoder.
return VP8_STATUS_BITSTREAM_ERROR;
}
if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
if (has_animation != NULL) *has_animation = animation_present;
if (format != NULL) *format = 0; // default = undefined
image_width = canvas_width;
image_height = canvas_height;
if (found_vp8x && animation_present && headers == NULL) {
status = VP8_STATUS_OK;
goto ReturnWidthHeight; // Just return features from VP8X header.
}
}
if (data_size < TAG_SIZE) {
status = VP8_STATUS_NOT_ENOUGH_DATA;
goto ReturnWidthHeight;
}
// Skip over optional chunks if data started with "RIFF + VP8X" or "ALPH".
if ((found_riff && found_vp8x) ||
(!found_riff && !found_vp8x && !memcmp(data, "ALPH", TAG_SIZE))) {
status = ParseOptionalChunks(&data, &data_size, hdrs.riff_size,
&hdrs.alpha_data, &hdrs.alpha_data_size);
if (status != VP8_STATUS_OK) {
goto ReturnWidthHeight; // Invalid chunk size / insufficient data.
}
}
// Skip over VP8/VP8L header.
status = ParseVP8Header(&data, &data_size, have_all_data, hdrs.riff_size,
&hdrs.compressed_size, &hdrs.is_lossless);
if (status != VP8_STATUS_OK) {
goto ReturnWidthHeight; // Wrong VP8/VP8L chunk-header / insufficient data.
}
if (hdrs.compressed_size > MAX_CHUNK_PAYLOAD) {
return VP8_STATUS_BITSTREAM_ERROR;
}
if (format != NULL && !animation_present) {
*format = hdrs.is_lossless ? 2 : 1;
}
if (!hdrs.is_lossless) {
if (data_size < VP8_FRAME_HEADER_SIZE) {
status = VP8_STATUS_NOT_ENOUGH_DATA;
goto ReturnWidthHeight;
}
// Validates raw VP8 data.
if (!VP8GetInfo(data, data_size, (uint32_t)hdrs.compressed_size,
&image_width, &image_height)) {
return VP8_STATUS_BITSTREAM_ERROR;
}
} else {
if (data_size < VP8L_FRAME_HEADER_SIZE) {
status = VP8_STATUS_NOT_ENOUGH_DATA;
goto ReturnWidthHeight;
}
// Validates raw VP8L data.
if (!VP8LGetInfo(data, data_size, &image_width, &image_height, has_alpha)) {
return VP8_STATUS_BITSTREAM_ERROR;
}
}
// Validates image size coherency.
if (found_vp8x) {
if (canvas_width != image_width || canvas_height != image_height) {
return VP8_STATUS_BITSTREAM_ERROR;
}
}
if (headers != NULL) {
*headers = hdrs;
headers->offset = data - headers->data;
assert((uint64_t)(data - headers->data) < MAX_CHUNK_PAYLOAD);
assert(headers->offset == headers->data_size - data_size);
}
ReturnWidthHeight:
if (status == VP8_STATUS_OK ||
(status == VP8_STATUS_NOT_ENOUGH_DATA && found_vp8x && headers == NULL)) {
if (has_alpha != NULL) {
// If the data did not contain a VP8X/VP8L chunk the only definitive way
// to set this is by looking for alpha data (from an ALPH chunk).
*has_alpha |= (hdrs.alpha_data != NULL);
}
if (width != NULL) *width = image_width;
if (height != NULL) *height = image_height;
return VP8_STATUS_OK;
} else {
return status;
}
}
VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
// status is marked volatile as a workaround for a clang-3.8 (aarch64) bug
volatile VP8StatusCode status;
int has_animation = 0;
assert(headers != NULL);
// fill out headers, ignore width/height/has_alpha.
status = ParseHeadersInternal(headers->data, headers->data_size,
NULL, NULL, NULL, &has_animation,
NULL, headers);
if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
// The WebPDemux API + libwebp can be used to decode individual
// uncomposited frames or the WebPAnimDecoder can be used to fully
// reconstruct them (see webp/demux.h).
if (has_animation) {
status = VP8_STATUS_UNSUPPORTED_FEATURE;
}
}
return status;
}
//------------------------------------------------------------------------------
// WebPDecParams
void WebPResetDecParams(WebPDecParams* const params) {
if (params != NULL) {
memset(params, 0, sizeof(*params));
}
}
//------------------------------------------------------------------------------
// "Into" decoding variants
// Main flow
static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
WebPDecParams* const params) {
VP8StatusCode status;
VP8Io io;
WebPHeaderStructure headers;
headers.data = data;
headers.data_size = data_size;
headers.have_all_data = 1;
status = WebPParseHeaders(&headers); // Process Pre-VP8 chunks.
if (status != VP8_STATUS_OK) {
return status;
}
assert(params != NULL);
VP8InitIo(&io);
io.data = headers.data + headers.offset;
io.data_size = headers.data_size - headers.offset;
WebPInitCustomIo(params, &io); // Plug the I/O functions.
if (!headers.is_lossless) {
VP8Decoder* const dec = VP8New();
if (dec == NULL) {
return VP8_STATUS_OUT_OF_MEMORY;
}
dec->alpha_data_ = headers.alpha_data;
dec->alpha_data_size_ = headers.alpha_data_size;
// Decode bitstream header, update io->width/io->height.
if (!VP8GetHeaders(dec, &io)) {
status = dec->status_; // An error occurred. Grab error status.
} else {
// Allocate/check output buffers.
status = WebPAllocateDecBuffer(io.width, io.height, params->options,
params->output);
if (status == VP8_STATUS_OK) { // Decode
// This change must be done before calling VP8Decode()
dec->mt_method_ = VP8GetThreadMethod(params->options, &headers,
io.width, io.height);
VP8InitDithering(params->options, dec);
if (!VP8Decode(dec, &io)) {
status = dec->status_;
}
}
}
VP8Delete(dec);
} else {
VP8LDecoder* const dec = VP8LNew();
if (dec == NULL) {
return VP8_STATUS_OUT_OF_MEMORY;
}
if (!VP8LDecodeHeader(dec, &io)) {
status = dec->status_; // An error occurred. Grab error status.
} else {
// Allocate/check output buffers.
status = WebPAllocateDecBuffer(io.width, io.height, params->options,
params->output);
if (status == VP8_STATUS_OK) { // Decode
if (!VP8LDecodeImage(dec)) {
status = dec->status_;
}
}
}
VP8LDelete(dec);
}
if (status != VP8_STATUS_OK) {
WebPFreeDecBuffer(params->output);
} else {
if (params->options != NULL && params->options->flip) {
// This restores the original stride values if options->flip was used
// during the call to WebPAllocateDecBuffer above.
status = WebPFlipBuffer(params->output);
}
}
return status;
}
// Helpers
static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace,
const uint8_t* const data,
size_t data_size,
uint8_t* const rgba,
int stride, size_t size) {
WebPDecParams params;
WebPDecBuffer buf;
if (rgba == NULL) {
return NULL;
}
WebPInitDecBuffer(&buf);
WebPResetDecParams(&params);
params.output = &buf;
buf.colorspace = colorspace;
buf.u.RGBA.rgba = rgba;
buf.u.RGBA.stride = stride;
buf.u.RGBA.size = size;
buf.is_external_memory = 1;
if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
return NULL;
}
return rgba;
}
uint8_t* WebPDecodeRGBInto(const uint8_t* data, size_t data_size,
uint8_t* output, size_t size, int stride) {
return DecodeIntoRGBABuffer(MODE_RGB, data, data_size, output, stride, size);
}
uint8_t* WebPDecodeRGBAInto(const uint8_t* data, size_t data_size,
uint8_t* output, size_t size, int stride) {
return DecodeIntoRGBABuffer(MODE_RGBA, data, data_size, output, stride, size);
}
uint8_t* WebPDecodeARGBInto(const uint8_t* data, size_t data_size,
uint8_t* output, size_t size, int stride) {
return DecodeIntoRGBABuffer(MODE_ARGB, data, data_size, output, stride, size);
}
uint8_t* WebPDecodeBGRInto(const uint8_t* data, size_t data_size,
uint8_t* output, size_t size, int stride) {
return DecodeIntoRGBABuffer(MODE_BGR, data, data_size, output, stride, size);
}
uint8_t* WebPDecodeBGRAInto(const uint8_t* data, size_t data_size,
uint8_t* output, size_t size, int stride) {
return DecodeIntoRGBABuffer(MODE_BGRA, data, data_size, output, stride, size);
}
uint8_t* WebPDecodeYUVInto(const uint8_t* data, size_t data_size,
uint8_t* luma, size_t luma_size, int luma_stride,
uint8_t* u, size_t u_size, int u_stride,
uint8_t* v, size_t v_size, int v_stride) {
WebPDecParams params;
WebPDecBuffer output;
if (luma == NULL) return NULL;
WebPInitDecBuffer(&output);
WebPResetDecParams(&params);
params.output = &output;
output.colorspace = MODE_YUV;
output.u.YUVA.y = luma;
output.u.YUVA.y_stride = luma_stride;
output.u.YUVA.y_size = luma_size;
output.u.YUVA.u = u;
output.u.YUVA.u_stride = u_stride;
output.u.YUVA.u_size = u_size;
output.u.YUVA.v = v;
output.u.YUVA.v_stride = v_stride;
output.u.YUVA.v_size = v_size;
output.is_external_memory = 1;
if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
return NULL;
}
return luma;
}
//------------------------------------------------------------------------------
static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* const data,
size_t data_size, int* const width, int* const height,
WebPDecBuffer* const keep_info) {
WebPDecParams params;
WebPDecBuffer output;
WebPInitDecBuffer(&output);
WebPResetDecParams(&params);
params.output = &output;
output.colorspace = mode;
// Retrieve (and report back) the required dimensions from bitstream.
if (!WebPGetInfo(data, data_size, &output.width, &output.height)) {
return NULL;
}
if (width != NULL) *width = output.width;
if (height != NULL) *height = output.height;
// Decode
if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) {
return NULL;
}
if (keep_info != NULL) { // keep track of the side-info
WebPCopyDecBuffer(&output, keep_info);
}
// return decoded samples (don't clear 'output'!)
return WebPIsRGBMode(mode) ? output.u.RGBA.rgba : output.u.YUVA.y;
}
uint8_t* WebPDecodeRGB(const uint8_t* data, size_t data_size,
int* width, int* height) {
return Decode(MODE_RGB, data, data_size, width, height, NULL);
}
uint8_t* WebPDecodeRGBA(const uint8_t* data, size_t data_size,
int* width, int* height) {
return Decode(MODE_RGBA, data, data_size, width, height, NULL);
}
uint8_t* WebPDecodeARGB(const uint8_t* data, size_t data_size,
int* width, int* height) {
return Decode(MODE_ARGB, data, data_size, width, height, NULL);
}
uint8_t* WebPDecodeBGR(const uint8_t* data, size_t data_size,
int* width, int* height) {
return Decode(MODE_BGR, data, data_size, width, height, NULL);
}
uint8_t* WebPDecodeBGRA(const uint8_t* data, size_t data_size,
int* width, int* height) {
return Decode(MODE_BGRA, data, data_size, width, height, NULL);
}
uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
int* width, int* height, uint8_t** u, uint8_t** v,
int* stride, int* uv_stride) {
WebPDecBuffer output; // only to preserve the side-infos
uint8_t* const out = Decode(MODE_YUV, data, data_size,
width, height, &output);
if (out != NULL) {
const WebPYUVABuffer* const buf = &output.u.YUVA;
*u = buf->u;
*v = buf->v;
*stride = buf->y_stride;
*uv_stride = buf->u_stride;
assert(buf->u_stride == buf->v_stride);
}
return out;
}
static void DefaultFeatures(WebPBitstreamFeatures* const features) {
assert(features != NULL);
memset(features, 0, sizeof(*features));
}
static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
WebPBitstreamFeatures* const features) {
if (features == NULL || data == NULL) {
return VP8_STATUS_INVALID_PARAM;
}
DefaultFeatures(features);
// Only parse enough of the data to retrieve the features.
return ParseHeadersInternal(data, data_size,
&features->width, &features->height,
&features->has_alpha, &features->has_animation,
&features->format, NULL);
}
//------------------------------------------------------------------------------
// WebPGetInfo()
int WebPGetInfo(const uint8_t* data, size_t data_size,
int* width, int* height) {
WebPBitstreamFeatures features;
if (GetFeatures(data, data_size, &features) != VP8_STATUS_OK) {
return 0;
}
if (width != NULL) {
*width = features.width;
}
if (height != NULL) {
*height = features.height;
}
return 1;
}
//------------------------------------------------------------------------------
// Advance decoding API
int WebPInitDecoderConfigInternal(WebPDecoderConfig* config,
int version) {
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
return 0; // version mismatch
}
if (config == NULL) {
return 0;
}
memset(config, 0, sizeof(*config));
DefaultFeatures(&config->input);
WebPInitDecBuffer(&config->output);
return 1;
}
VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, size_t data_size,
WebPBitstreamFeatures* features,
int version) {
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DECODER_ABI_VERSION)) {
return VP8_STATUS_INVALID_PARAM; // version mismatch
}
if (features == NULL) {
return VP8_STATUS_INVALID_PARAM;
}
return GetFeatures(data, data_size, features);
}
VP8StatusCode WebPDecode(const uint8_t* data, size_t data_size,
WebPDecoderConfig* config) {
WebPDecParams params;
VP8StatusCode status;
if (config == NULL) {
return VP8_STATUS_INVALID_PARAM;
}
status = GetFeatures(data, data_size, &config->input);
if (status != VP8_STATUS_OK) {
if (status == VP8_STATUS_NOT_ENOUGH_DATA) {
return VP8_STATUS_BITSTREAM_ERROR; // Not-enough-data treated as error.
}
return status;
}
WebPResetDecParams(&params);
params.options = &config->options;
params.output = &config->output;
if (WebPAvoidSlowMemory(params.output, &config->input)) {
// decoding to slow memory: use a temporary in-mem buffer to decode into.
WebPDecBuffer in_mem_buffer;
WebPInitDecBuffer(&in_mem_buffer);
in_mem_buffer.colorspace = config->output.colorspace;
in_mem_buffer.width = config->input.width;
in_mem_buffer.height = config->input.height;
params.output = &in_mem_buffer;
status = DecodeInto(data, data_size, &params);
if (status == VP8_STATUS_OK) { // do the slow-copy
status = WebPCopyDecBufferPixels(&in_mem_buffer, &config->output);
}
WebPFreeDecBuffer(&in_mem_buffer);
} else {
status = DecodeInto(data, data_size, &params);
}
return status;
}
//------------------------------------------------------------------------------
// Cropping and rescaling.
int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
VP8Io* const io, WEBP_CSP_MODE src_colorspace) {
const int W = io->width;
const int H = io->height;
int x = 0, y = 0, w = W, h = H;
// Cropping
io->use_cropping = (options != NULL) && (options->use_cropping > 0);
if (io->use_cropping) {
w = options->crop_width;
h = options->crop_height;
x = options->crop_left;
y = options->crop_top;
if (!WebPIsRGBMode(src_colorspace)) { // only snap for YUV420
x &= ~1;
y &= ~1;
}
if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) {
return 0; // out of frame boundary error
}
}
io->crop_left = x;
io->crop_top = y;
io->crop_right = x + w;
io->crop_bottom = y + h;
io->mb_w = w;
io->mb_h = h;
// Scaling
io->use_scaling = (options != NULL) && (options->use_scaling > 0);
if (io->use_scaling) {
int scaled_width = options->scaled_width;
int scaled_height = options->scaled_height;
if (!WebPRescalerGetScaledDimensions(w, h, &scaled_width, &scaled_height)) {
return 0;
}
io->scaled_width = scaled_width;
io->scaled_height = scaled_height;
}
// Filter
io->bypass_filtering = (options != NULL) && options->bypass_filtering;
// Fancy upsampler
#ifdef FANCY_UPSAMPLING
io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling);
#endif
if (io->use_scaling) {
// disable filter (only for large downscaling ratio).
io->bypass_filtering = (io->scaled_width < W * 3 / 4) &&
(io->scaled_height < H * 3 / 4);
io->fancy_upsampling = 0;
}
return 1;
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,133 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Internal header: WebP decoding parameters and custom IO on buffer
//
// Author: somnath@google.com (Somnath Banerjee)
#ifndef WEBP_DEC_WEBPI_DEC_H_
#define WEBP_DEC_WEBPI_DEC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include "utils_rescaler_utils.h"
#include "dec_vp8_dec.h"
//------------------------------------------------------------------------------
// WebPDecParams: Decoding output parameters. Transient internal object.
typedef struct WebPDecParams WebPDecParams;
typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p);
typedef int (*OutputAlphaFunc)(const VP8Io* const io, WebPDecParams* const p,
int expected_num_out_lines);
typedef int (*OutputRowFunc)(WebPDecParams* const p, int y_pos,
int max_out_lines);
struct WebPDecParams {
WebPDecBuffer* output; // output buffer.
uint8_t* tmp_y, *tmp_u, *tmp_v; // cache for the fancy upsampler
// or used for tmp rescaling
int last_y; // coordinate of the line that was last output
const WebPDecoderOptions* options; // if not NULL, use alt decoding features
WebPRescaler* scaler_y, *scaler_u, *scaler_v, *scaler_a; // rescalers
void* memory; // overall scratch memory for the output work.
OutputFunc emit; // output RGB or YUV samples
OutputAlphaFunc emit_alpha; // output alpha channel
OutputRowFunc emit_alpha_row; // output one line of rescaled alpha values
};
// Should be called first, before any use of the WebPDecParams object.
void WebPResetDecParams(WebPDecParams* const params);
//------------------------------------------------------------------------------
// Header parsing helpers
// Structure storing a description of the RIFF headers.
typedef struct {
const uint8_t* data; // input buffer
size_t data_size; // input buffer size
int have_all_data; // true if all data is known to be available
size_t offset; // offset to main data chunk (VP8 or VP8L)
const uint8_t* alpha_data; // points to alpha chunk (if present)
size_t alpha_data_size; // alpha chunk size
size_t compressed_size; // VP8/VP8L compressed data size
size_t riff_size; // size of the riff payload (or 0 if absent)
int is_lossless; // true if a VP8L chunk is present
} WebPHeaderStructure;
// Skips over all valid chunks prior to the first VP8/VP8L frame header.
// Returns: VP8_STATUS_OK, VP8_STATUS_BITSTREAM_ERROR (invalid header/chunk),
// VP8_STATUS_NOT_ENOUGH_DATA (partial input) or VP8_STATUS_UNSUPPORTED_FEATURE
// in the case of non-decodable features (animation for instance).
// In 'headers', compressed_size, offset, alpha_data, alpha_size, and lossless
// fields are updated appropriately upon success.
VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers);
//------------------------------------------------------------------------------
// Misc utils
// Initializes VP8Io with custom setup, io and teardown functions. The default
// hooks will use the supplied 'params' as io->opaque handle.
void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io);
// Setup crop_xxx fields, mb_w and mb_h in io. 'src_colorspace' refers
// to the *compressed* format, not the output one.
int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
VP8Io* const io, WEBP_CSP_MODE src_colorspace);
//------------------------------------------------------------------------------
// Internal functions regarding WebPDecBuffer memory (in buffer.c).
// Don't really need to be externally visible for now.
// Prepare 'buffer' with the requested initial dimensions width/height.
// If no external storage is supplied, initializes buffer by allocating output
// memory and setting up the stride information. Validate the parameters. Return
// an error code in case of problem (no memory, or invalid stride / size /
// dimension / etc.). If *options is not NULL, also verify that the options'
// parameters are valid and apply them to the width/height dimensions of the
// output buffer. This takes cropping / scaling / rotation into account.
// Also incorporates the options->flip flag to flip the buffer parameters if
// needed.
VP8StatusCode WebPAllocateDecBuffer(int width, int height,
const WebPDecoderOptions* const options,
WebPDecBuffer* const buffer);
// Flip buffer vertically by negating the various strides.
VP8StatusCode WebPFlipBuffer(WebPDecBuffer* const buffer);
// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the
// memory (still held by 'src'). No pixels are copied.
void WebPCopyDecBuffer(const WebPDecBuffer* const src,
WebPDecBuffer* const dst);
// Copy and transfer ownership from src to dst (beware of parameter order!)
void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
// Copy pixels from 'src' into a *preallocated* 'dst' buffer. Returns
// VP8_STATUS_INVALID_PARAM if the 'dst' is not set up correctly for the copy.
VP8StatusCode WebPCopyDecBufferPixels(const WebPDecBuffer* const src,
WebPDecBuffer* const dst);
// Returns true if decoding will be slow with the current configuration
// and bitstream features.
int WebPAvoidSlowMemory(const WebPDecBuffer* const output,
const WebPBitstreamFeatures* const features);
//------------------------------------------------------------------------------
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DEC_WEBPI_DEC_H_

View File

@ -0,0 +1,457 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// AnimDecoder implementation.
//
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <assert.h>
#include <string.h>
#include "utils_utils.h"
#include "webp_decode.h"
#include "webp_demux.h"
#define NUM_CHANNELS 4
typedef void (*BlendRowFunc)(uint32_t* const, const uint32_t* const, int);
static void BlendPixelRowNonPremult(uint32_t* const src,
const uint32_t* const dst, int num_pixels);
static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
int num_pixels);
struct WebPAnimDecoder {
WebPDemuxer* demux_; // Demuxer created from given WebP bitstream.
WebPDecoderConfig config_; // Decoder config.
// Note: we use a pointer to a function blending multiple pixels at a time to
// allow possible inlining of per-pixel blending function.
BlendRowFunc blend_func_; // Pointer to the chose blend row function.
WebPAnimInfo info_; // Global info about the animation.
uint8_t* curr_frame_; // Current canvas (not disposed).
uint8_t* prev_frame_disposed_; // Previous canvas (properly disposed).
int prev_frame_timestamp_; // Previous frame timestamp (milliseconds).
WebPIterator prev_iter_; // Iterator object for previous frame.
int prev_frame_was_keyframe_; // True if previous frame was a keyframe.
int next_frame_; // Index of the next frame to be decoded
// (starting from 1).
};
static void DefaultDecoderOptions(WebPAnimDecoderOptions* const dec_options) {
dec_options->color_mode = MODE_RGBA;
dec_options->use_threads = 0;
}
int WebPAnimDecoderOptionsInitInternal(WebPAnimDecoderOptions* dec_options,
int abi_version) {
if (dec_options == NULL ||
WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
return 0;
}
DefaultDecoderOptions(dec_options);
return 1;
}
static int ApplyDecoderOptions(const WebPAnimDecoderOptions* const dec_options,
WebPAnimDecoder* const dec) {
WEBP_CSP_MODE mode;
WebPDecoderConfig* config = &dec->config_;
assert(dec_options != NULL);
mode = dec_options->color_mode;
if (mode != MODE_RGBA && mode != MODE_BGRA &&
mode != MODE_rgbA && mode != MODE_bgrA) {
return 0;
}
dec->blend_func_ = (mode == MODE_RGBA || mode == MODE_BGRA)
? &BlendPixelRowNonPremult
: &BlendPixelRowPremult;
WebPInitDecoderConfig(config);
config->output.colorspace = mode;
config->output.is_external_memory = 1;
config->options.use_threads = dec_options->use_threads;
// Note: config->output.u.RGBA is set at the time of decoding each frame.
return 1;
}
WebPAnimDecoder* WebPAnimDecoderNewInternal(
const WebPData* webp_data, const WebPAnimDecoderOptions* dec_options,
int abi_version) {
WebPAnimDecoderOptions options;
WebPAnimDecoder* dec = NULL;
if (webp_data == NULL ||
WEBP_ABI_IS_INCOMPATIBLE(abi_version, WEBP_DEMUX_ABI_VERSION)) {
return NULL;
}
// Note: calloc() so that the pointer members are initialized to NULL.
dec = (WebPAnimDecoder*)WebPSafeCalloc(1ULL, sizeof(*dec));
if (dec == NULL) goto Error;
if (dec_options != NULL) {
options = *dec_options;
} else {
DefaultDecoderOptions(&options);
}
if (!ApplyDecoderOptions(&options, dec)) goto Error;
dec->demux_ = WebPDemux(webp_data);
if (dec->demux_ == NULL) goto Error;
dec->info_.canvas_width = WebPDemuxGetI(dec->demux_, WEBP_FF_CANVAS_WIDTH);
dec->info_.canvas_height = WebPDemuxGetI(dec->demux_, WEBP_FF_CANVAS_HEIGHT);
dec->info_.loop_count = WebPDemuxGetI(dec->demux_, WEBP_FF_LOOP_COUNT);
dec->info_.bgcolor = WebPDemuxGetI(dec->demux_, WEBP_FF_BACKGROUND_COLOR);
dec->info_.frame_count = WebPDemuxGetI(dec->demux_, WEBP_FF_FRAME_COUNT);
// Note: calloc() because we fill frame with zeroes as well.
dec->curr_frame_ = (uint8_t*)WebPSafeCalloc(
dec->info_.canvas_width * NUM_CHANNELS, dec->info_.canvas_height);
if (dec->curr_frame_ == NULL) goto Error;
dec->prev_frame_disposed_ = (uint8_t*)WebPSafeCalloc(
dec->info_.canvas_width * NUM_CHANNELS, dec->info_.canvas_height);
if (dec->prev_frame_disposed_ == NULL) goto Error;
WebPAnimDecoderReset(dec);
return dec;
Error:
WebPAnimDecoderDelete(dec);
return NULL;
}
int WebPAnimDecoderGetInfo(const WebPAnimDecoder* dec, WebPAnimInfo* info) {
if (dec == NULL || info == NULL) return 0;
*info = dec->info_;
return 1;
}
// Returns true if the frame covers the full canvas.
static int IsFullFrame(int width, int height, int canvas_width,
int canvas_height) {
return (width == canvas_width && height == canvas_height);
}
// Clear the canvas to transparent.
static int ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
uint32_t canvas_height) {
const uint64_t size =
(uint64_t)canvas_width * canvas_height * NUM_CHANNELS * sizeof(*buf);
if (size != (size_t)size) return 0;
memset(buf, 0, (size_t)size);
return 1;
}
// Clear given frame rectangle to transparent.
static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
int y_offset, int width, int height) {
int j;
assert(width * NUM_CHANNELS <= buf_stride);
buf += y_offset * buf_stride + x_offset * NUM_CHANNELS;
for (j = 0; j < height; ++j) {
memset(buf, 0, width * NUM_CHANNELS);
buf += buf_stride;
}
}
// Copy width * height pixels from 'src' to 'dst'.
static int CopyCanvas(const uint8_t* src, uint8_t* dst,
uint32_t width, uint32_t height) {
const uint64_t size = (uint64_t)width * height * NUM_CHANNELS;
if (size != (size_t)size) return 0;
assert(src != NULL && dst != NULL);
memcpy(dst, src, (size_t)size);
return 1;
}
// Returns true if the current frame is a key-frame.
static int IsKeyFrame(const WebPIterator* const curr,
const WebPIterator* const prev,
int prev_frame_was_key_frame,
int canvas_width, int canvas_height) {
if (curr->frame_num == 1) {
return 1;
} else if ((!curr->has_alpha || curr->blend_method == WEBP_MUX_NO_BLEND) &&
IsFullFrame(curr->width, curr->height,
canvas_width, canvas_height)) {
return 1;
} else {
return (prev->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) &&
(IsFullFrame(prev->width, prev->height, canvas_width,
canvas_height) ||
prev_frame_was_key_frame);
}
}
// Blend a single channel of 'src' over 'dst', given their alpha channel values.
// 'src' and 'dst' are assumed to be NOT pre-multiplied by alpha.
static uint8_t BlendChannelNonPremult(uint32_t src, uint8_t src_a,
uint32_t dst, uint8_t dst_a,
uint32_t scale, int shift) {
const uint8_t src_channel = (src >> shift) & 0xff;
const uint8_t dst_channel = (dst >> shift) & 0xff;
const uint32_t blend_unscaled = src_channel * src_a + dst_channel * dst_a;
assert(blend_unscaled < (1ULL << 32) / scale);
return (blend_unscaled * scale) >> 24;
}
// Blend 'src' over 'dst' assuming they are NOT pre-multiplied by alpha.
static uint32_t BlendPixelNonPremult(uint32_t src, uint32_t dst) {
const uint8_t src_a = (src >> 24) & 0xff;
if (src_a == 0) {
return dst;
} else {
const uint8_t dst_a = (dst >> 24) & 0xff;
// This is the approximate integer arithmetic for the actual formula:
// dst_factor_a = (dst_a * (255 - src_a)) / 255.
const uint8_t dst_factor_a = (dst_a * (256 - src_a)) >> 8;
const uint8_t blend_a = src_a + dst_factor_a;
const uint32_t scale = (1UL << 24) / blend_a;
const uint8_t blend_r =
BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 0);
const uint8_t blend_g =
BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 8);
const uint8_t blend_b =
BlendChannelNonPremult(src, src_a, dst, dst_factor_a, scale, 16);
assert(src_a + dst_factor_a < 256);
return (blend_r << 0) |
(blend_g << 8) |
(blend_b << 16) |
((uint32_t)blend_a << 24);
}
}
// Blend 'num_pixels' in 'src' over 'dst' assuming they are NOT pre-multiplied
// by alpha.
static void BlendPixelRowNonPremult(uint32_t* const src,
const uint32_t* const dst, int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint8_t src_alpha = (src[i] >> 24) & 0xff;
if (src_alpha != 0xff) {
src[i] = BlendPixelNonPremult(src[i], dst[i]);
}
}
}
// Individually multiply each channel in 'pix' by 'scale'.
static WEBP_INLINE uint32_t ChannelwiseMultiply(uint32_t pix, uint32_t scale) {
uint32_t mask = 0x00FF00FF;
uint32_t rb = ((pix & mask) * scale) >> 8;
uint32_t ag = ((pix >> 8) & mask) * scale;
return (rb & mask) | (ag & ~mask);
}
// Blend 'src' over 'dst' assuming they are pre-multiplied by alpha.
static uint32_t BlendPixelPremult(uint32_t src, uint32_t dst) {
const uint8_t src_a = (src >> 24) & 0xff;
return src + ChannelwiseMultiply(dst, 256 - src_a);
}
// Blend 'num_pixels' in 'src' over 'dst' assuming they are pre-multiplied by
// alpha.
static void BlendPixelRowPremult(uint32_t* const src, const uint32_t* const dst,
int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint8_t src_alpha = (src[i] >> 24) & 0xff;
if (src_alpha != 0xff) {
src[i] = BlendPixelPremult(src[i], dst[i]);
}
}
}
// Returns two ranges (<left, width> pairs) at row 'canvas_y', that belong to
// 'src' but not 'dst'. A point range is empty if the corresponding width is 0.
static void FindBlendRangeAtRow(const WebPIterator* const src,
const WebPIterator* const dst, int canvas_y,
int* const left1, int* const width1,
int* const left2, int* const width2) {
const int src_max_x = src->x_offset + src->width;
const int dst_max_x = dst->x_offset + dst->width;
const int dst_max_y = dst->y_offset + dst->height;
assert(canvas_y >= src->y_offset && canvas_y < (src->y_offset + src->height));
*left1 = -1;
*width1 = 0;
*left2 = -1;
*width2 = 0;
if (canvas_y < dst->y_offset || canvas_y >= dst_max_y ||
src->x_offset >= dst_max_x || src_max_x <= dst->x_offset) {
*left1 = src->x_offset;
*width1 = src->width;
return;
}
if (src->x_offset < dst->x_offset) {
*left1 = src->x_offset;
*width1 = dst->x_offset - src->x_offset;
}
if (src_max_x > dst_max_x) {
*left2 = dst_max_x;
*width2 = src_max_x - dst_max_x;
}
}
int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
uint8_t** buf_ptr, int* timestamp_ptr) {
WebPIterator iter;
uint32_t width;
uint32_t height;
int is_key_frame;
int timestamp;
BlendRowFunc blend_row;
if (dec == NULL || buf_ptr == NULL || timestamp_ptr == NULL) return 0;
if (!WebPAnimDecoderHasMoreFrames(dec)) return 0;
width = dec->info_.canvas_width;
height = dec->info_.canvas_height;
blend_row = dec->blend_func_;
// Get compressed frame.
if (!WebPDemuxGetFrame(dec->demux_, dec->next_frame_, &iter)) {
return 0;
}
timestamp = dec->prev_frame_timestamp_ + iter.duration;
// Initialize.
is_key_frame = IsKeyFrame(&iter, &dec->prev_iter_,
dec->prev_frame_was_keyframe_, width, height);
if (is_key_frame) {
if (!ZeroFillCanvas(dec->curr_frame_, width, height)) {
goto Error;
}
} else {
if (!CopyCanvas(dec->prev_frame_disposed_, dec->curr_frame_,
width, height)) {
goto Error;
}
}
// Decode.
{
const uint8_t* in = iter.fragment.bytes;
const size_t in_size = iter.fragment.size;
const uint32_t stride = width * NUM_CHANNELS; // at most 25 + 2 bits
const uint64_t out_offset = (uint64_t)iter.y_offset * stride +
(uint64_t)iter.x_offset * NUM_CHANNELS; // 53b
const uint64_t size = (uint64_t)iter.height * stride; // at most 25 + 27b
WebPDecoderConfig* const config = &dec->config_;
WebPRGBABuffer* const buf = &config->output.u.RGBA;
if ((size_t)size != size) goto Error;
buf->stride = (int)stride;
buf->size = (size_t)size;
buf->rgba = dec->curr_frame_ + out_offset;
if (WebPDecode(in, in_size, config) != VP8_STATUS_OK) {
goto Error;
}
}
// During the decoding of current frame, we may have set some pixels to be
// transparent (i.e. alpha < 255). However, the value of each of these
// pixels should have been determined by blending it against the value of
// that pixel in the previous frame if blending method of is WEBP_MUX_BLEND.
if (iter.frame_num > 1 && iter.blend_method == WEBP_MUX_BLEND &&
!is_key_frame) {
if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_NONE) {
int y;
// Blend transparent pixels with pixels in previous canvas.
for (y = 0; y < iter.height; ++y) {
const size_t offset =
(iter.y_offset + y) * width + iter.x_offset;
blend_row((uint32_t*)dec->curr_frame_ + offset,
(uint32_t*)dec->prev_frame_disposed_ + offset, iter.width);
}
} else {
int y;
assert(dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND);
// We need to blend a transparent pixel with its value just after
// initialization. That is, blend it with:
// * Fully transparent pixel if it belongs to prevRect <-- No-op.
// * The pixel in the previous canvas otherwise <-- Need alpha-blending.
for (y = 0; y < iter.height; ++y) {
const int canvas_y = iter.y_offset + y;
int left1, width1, left2, width2;
FindBlendRangeAtRow(&iter, &dec->prev_iter_, canvas_y, &left1, &width1,
&left2, &width2);
if (width1 > 0) {
const size_t offset1 = canvas_y * width + left1;
blend_row((uint32_t*)dec->curr_frame_ + offset1,
(uint32_t*)dec->prev_frame_disposed_ + offset1, width1);
}
if (width2 > 0) {
const size_t offset2 = canvas_y * width + left2;
blend_row((uint32_t*)dec->curr_frame_ + offset2,
(uint32_t*)dec->prev_frame_disposed_ + offset2, width2);
}
}
}
}
// Update info of the previous frame and dispose it for the next iteration.
dec->prev_frame_timestamp_ = timestamp;
WebPDemuxReleaseIterator(&dec->prev_iter_);
dec->prev_iter_ = iter;
dec->prev_frame_was_keyframe_ = is_key_frame;
CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height);
if (dec->prev_iter_.dispose_method == WEBP_MUX_DISPOSE_BACKGROUND) {
ZeroFillFrameRect(dec->prev_frame_disposed_, width * NUM_CHANNELS,
dec->prev_iter_.x_offset, dec->prev_iter_.y_offset,
dec->prev_iter_.width, dec->prev_iter_.height);
}
++dec->next_frame_;
// All OK, fill in the values.
*buf_ptr = dec->curr_frame_;
*timestamp_ptr = timestamp;
return 1;
Error:
WebPDemuxReleaseIterator(&iter);
return 0;
}
int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec) {
if (dec == NULL) return 0;
return (dec->next_frame_ <= (int)dec->info_.frame_count);
}
void WebPAnimDecoderReset(WebPAnimDecoder* dec) {
if (dec != NULL) {
dec->prev_frame_timestamp_ = 0;
WebPDemuxReleaseIterator(&dec->prev_iter_);
memset(&dec->prev_iter_, 0, sizeof(dec->prev_iter_));
dec->prev_frame_was_keyframe_ = 0;
dec->next_frame_ = 1;
}
}
const WebPDemuxer* WebPAnimDecoderGetDemuxer(const WebPAnimDecoder* dec) {
if (dec == NULL) return NULL;
return dec->demux_;
}
void WebPAnimDecoderDelete(WebPAnimDecoder* dec) {
if (dec != NULL) {
WebPDemuxReleaseIterator(&dec->prev_iter_);
WebPDemuxDelete(dec->demux_);
WebPSafeFree(dec->curr_frame_);
WebPSafeFree(dec->prev_frame_disposed_);
WebPSafeFree(dec);
}
}

972
vendor/github.com/sizeofint/webpanimation/demux_demux.c generated vendored Normal file
View File

@ -0,0 +1,972 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// WebP container demux.
//
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include "utils_utils.h"
#include "webp_decode.h"
#include "webp_demux.h"
#include "webp_format_constants.h"
#define DMUX_MAJ_VERSION 1
#define DMUX_MIN_VERSION 2
#define DMUX_REV_VERSION 0
typedef struct {
size_t start_; // start location of the data
size_t end_; // end location
size_t riff_end_; // riff chunk end location, can be > end_.
size_t buf_size_; // size of the buffer
const uint8_t* buf_;
} MemBuffer;
typedef struct {
size_t offset_;
size_t size_;
} ChunkData;
typedef struct Frame {
int x_offset_, y_offset_;
int width_, height_;
int has_alpha_;
int duration_;
WebPMuxAnimDispose dispose_method_;
WebPMuxAnimBlend blend_method_;
int frame_num_;
int complete_; // img_components_ contains a full image.
ChunkData img_components_[2]; // 0=VP8{,L} 1=ALPH
struct Frame* next_;
} Frame;
typedef struct Chunk {
ChunkData data_;
struct Chunk* next_;
} Chunk;
struct WebPDemuxer {
MemBuffer mem_;
WebPDemuxState state_;
int is_ext_format_;
uint32_t feature_flags_;
int canvas_width_, canvas_height_;
int loop_count_;
uint32_t bgcolor_;
int num_frames_;
Frame* frames_;
Frame** frames_tail_;
Chunk* chunks_; // non-image chunks
Chunk** chunks_tail_;
};
typedef enum {
PARSE_OK,
PARSE_NEED_MORE_DATA,
PARSE_ERROR
} ParseStatus;
typedef struct ChunkParser {
uint8_t id[4];
ParseStatus (*parse)(WebPDemuxer* const dmux);
int (*valid)(const WebPDemuxer* const dmux);
} ChunkParser;
static ParseStatus ParseSingleImage(WebPDemuxer* const dmux);
static ParseStatus ParseVP8X(WebPDemuxer* const dmux);
static int IsValidSimpleFormat(const WebPDemuxer* const dmux);
static int IsValidExtendedFormat(const WebPDemuxer* const dmux);
static const ChunkParser kMasterChunks[] = {
{ { 'V', 'P', '8', ' ' }, ParseSingleImage, IsValidSimpleFormat },
{ { 'V', 'P', '8', 'L' }, ParseSingleImage, IsValidSimpleFormat },
{ { 'V', 'P', '8', 'X' }, ParseVP8X, IsValidExtendedFormat },
{ { '0', '0', '0', '0' }, NULL, NULL },
};
//------------------------------------------------------------------------------
int WebPGetDemuxVersion(void) {
return (DMUX_MAJ_VERSION << 16) | (DMUX_MIN_VERSION << 8) | DMUX_REV_VERSION;
}
// -----------------------------------------------------------------------------
// MemBuffer
static int RemapMemBuffer(MemBuffer* const mem,
const uint8_t* data, size_t size) {
if (size < mem->buf_size_) return 0; // can't remap to a shorter buffer!
mem->buf_ = data;
mem->end_ = mem->buf_size_ = size;
return 1;
}
static int InitMemBuffer(MemBuffer* const mem,
const uint8_t* data, size_t size) {
memset(mem, 0, sizeof(*mem));
return RemapMemBuffer(mem, data, size);
}
// Return the remaining data size available in 'mem'.
static WEBP_INLINE size_t MemDataSize(const MemBuffer* const mem) {
return (mem->end_ - mem->start_);
}
// Return true if 'size' exceeds the end of the RIFF chunk.
static WEBP_INLINE int SizeIsInvalid(const MemBuffer* const mem, size_t size) {
return (size > mem->riff_end_ - mem->start_);
}
static WEBP_INLINE void Skip(MemBuffer* const mem, size_t size) {
mem->start_ += size;
}
static WEBP_INLINE void Rewind(MemBuffer* const mem, size_t size) {
mem->start_ -= size;
}
static WEBP_INLINE const uint8_t* GetBuffer(MemBuffer* const mem) {
return mem->buf_ + mem->start_;
}
// Read from 'mem' and skip the read bytes.
static WEBP_INLINE uint8_t ReadByte(MemBuffer* const mem) {
const uint8_t byte = mem->buf_[mem->start_];
Skip(mem, 1);
return byte;
}
static WEBP_INLINE int ReadLE16s(MemBuffer* const mem) {
const uint8_t* const data = mem->buf_ + mem->start_;
const int val = GetLE16(data);
Skip(mem, 2);
return val;
}
static WEBP_INLINE int ReadLE24s(MemBuffer* const mem) {
const uint8_t* const data = mem->buf_ + mem->start_;
const int val = GetLE24(data);
Skip(mem, 3);
return val;
}
static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
const uint8_t* const data = mem->buf_ + mem->start_;
const uint32_t val = GetLE32(data);
Skip(mem, 4);
return val;
}
// -----------------------------------------------------------------------------
// Secondary chunk parsing
static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
*dmux->chunks_tail_ = chunk;
chunk->next_ = NULL;
dmux->chunks_tail_ = &chunk->next_;
}
// Add a frame to the end of the list, ensuring the last frame is complete.
// Returns true on success, false otherwise.
static int AddFrame(WebPDemuxer* const dmux, Frame* const frame) {
const Frame* const last_frame = *dmux->frames_tail_;
if (last_frame != NULL && !last_frame->complete_) return 0;
*dmux->frames_tail_ = frame;
frame->next_ = NULL;
dmux->frames_tail_ = &frame->next_;
return 1;
}
static void SetFrameInfo(size_t start_offset, size_t size,
int frame_num, int complete,
const WebPBitstreamFeatures* const features,
Frame* const frame) {
frame->img_components_[0].offset_ = start_offset;
frame->img_components_[0].size_ = size;
frame->width_ = features->width;
frame->height_ = features->height;
frame->has_alpha_ |= features->has_alpha;
frame->frame_num_ = frame_num;
frame->complete_ = complete;
}
// Store image bearing chunks to 'frame'. 'min_size' is an optional size
// requirement, it may be zero.
static ParseStatus StoreFrame(int frame_num, uint32_t min_size,
MemBuffer* const mem, Frame* const frame) {
int alpha_chunks = 0;
int image_chunks = 0;
int done = (MemDataSize(mem) < CHUNK_HEADER_SIZE ||
MemDataSize(mem) < min_size);
ParseStatus status = PARSE_OK;
if (done) return PARSE_NEED_MORE_DATA;
do {
const size_t chunk_start_offset = mem->start_;
const uint32_t fourcc = ReadLE32(mem);
const uint32_t payload_size = ReadLE32(mem);
const uint32_t payload_size_padded = payload_size + (payload_size & 1);
const size_t payload_available = (payload_size_padded > MemDataSize(mem))
? MemDataSize(mem) : payload_size_padded;
const size_t chunk_size = CHUNK_HEADER_SIZE + payload_available;
if (payload_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
if (SizeIsInvalid(mem, payload_size_padded)) return PARSE_ERROR;
if (payload_size_padded > MemDataSize(mem)) status = PARSE_NEED_MORE_DATA;
switch (fourcc) {
case MKFOURCC('A', 'L', 'P', 'H'):
if (alpha_chunks == 0) {
++alpha_chunks;
frame->img_components_[1].offset_ = chunk_start_offset;
frame->img_components_[1].size_ = chunk_size;
frame->has_alpha_ = 1;
frame->frame_num_ = frame_num;
Skip(mem, payload_available);
} else {
goto Done;
}
break;
case MKFOURCC('V', 'P', '8', 'L'):
if (alpha_chunks > 0) return PARSE_ERROR; // VP8L has its own alpha
// fall through
case MKFOURCC('V', 'P', '8', ' '):
if (image_chunks == 0) {
// Extract the bitstream features, tolerating failures when the data
// is incomplete.
WebPBitstreamFeatures features;
const VP8StatusCode vp8_status =
WebPGetFeatures(mem->buf_ + chunk_start_offset, chunk_size,
&features);
if (status == PARSE_NEED_MORE_DATA &&
vp8_status == VP8_STATUS_NOT_ENOUGH_DATA) {
return PARSE_NEED_MORE_DATA;
} else if (vp8_status != VP8_STATUS_OK) {
// We have enough data, and yet WebPGetFeatures() failed.
return PARSE_ERROR;
}
++image_chunks;
SetFrameInfo(chunk_start_offset, chunk_size, frame_num,
status == PARSE_OK, &features, frame);
Skip(mem, payload_available);
} else {
goto Done;
}
break;
Done:
default:
// Restore fourcc/size when moving up one level in parsing.
Rewind(mem, CHUNK_HEADER_SIZE);
done = 1;
break;
}
if (mem->start_ == mem->riff_end_) {
done = 1;
} else if (MemDataSize(mem) < CHUNK_HEADER_SIZE) {
status = PARSE_NEED_MORE_DATA;
}
} while (!done && status == PARSE_OK);
return status;
}
// Creates a new Frame if 'actual_size' is within bounds and 'mem' contains
// enough data ('min_size') to parse the payload.
// Returns PARSE_OK on success with *frame pointing to the new Frame.
// Returns PARSE_NEED_MORE_DATA with insufficient data, PARSE_ERROR otherwise.
static ParseStatus NewFrame(const MemBuffer* const mem,
uint32_t min_size, uint32_t actual_size,
Frame** frame) {
if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
if (actual_size < min_size) return PARSE_ERROR;
if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
*frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(**frame));
return (*frame == NULL) ? PARSE_ERROR : PARSE_OK;
}
// Parse a 'ANMF' chunk and any image bearing chunks that immediately follow.
// 'frame_chunk_size' is the previously validated, padded chunk size.
static ParseStatus ParseAnimationFrame(
WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
int added_frame = 0;
int bits;
MemBuffer* const mem = &dmux->mem_;
Frame* frame;
size_t start_offset;
ParseStatus status =
NewFrame(mem, ANMF_CHUNK_SIZE, frame_chunk_size, &frame);
if (status != PARSE_OK) return status;
frame->x_offset_ = 2 * ReadLE24s(mem);
frame->y_offset_ = 2 * ReadLE24s(mem);
frame->width_ = 1 + ReadLE24s(mem);
frame->height_ = 1 + ReadLE24s(mem);
frame->duration_ = ReadLE24s(mem);
bits = ReadByte(mem);
frame->dispose_method_ =
(bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
frame->blend_method_ = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
if (frame->width_ * (uint64_t)frame->height_ >= MAX_IMAGE_AREA) {
WebPSafeFree(frame);
return PARSE_ERROR;
}
// Store a frame only if the animation flag is set there is some data for
// this frame is available.
start_offset = mem->start_;
status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
if (status != PARSE_ERROR && mem->start_ - start_offset > anmf_payload_size) {
status = PARSE_ERROR;
}
if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
added_frame = AddFrame(dmux, frame);
if (added_frame) {
++dmux->num_frames_;
} else {
status = PARSE_ERROR;
}
}
if (!added_frame) WebPSafeFree(frame);
return status;
}
// General chunk storage, starting with the header at 'start_offset', allowing
// the user to request the payload via a fourcc string. 'size' includes the
// header and the unpadded payload size.
// Returns true on success, false otherwise.
static int StoreChunk(WebPDemuxer* const dmux,
size_t start_offset, uint32_t size) {
Chunk* const chunk = (Chunk*)WebPSafeCalloc(1ULL, sizeof(*chunk));
if (chunk == NULL) return 0;
chunk->data_.offset_ = start_offset;
chunk->data_.size_ = size;
AddChunk(dmux, chunk);
return 1;
}
// -----------------------------------------------------------------------------
// Primary chunk parsing
static ParseStatus ReadHeader(MemBuffer* const mem) {
const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
uint32_t riff_size;
// Basic file level validation.
if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
return PARSE_ERROR;
}
riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
if (riff_size < CHUNK_HEADER_SIZE) return PARSE_ERROR;
if (riff_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
// There's no point in reading past the end of the RIFF chunk
mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
if (mem->buf_size_ > mem->riff_end_) {
mem->buf_size_ = mem->end_ = mem->riff_end_;
}
Skip(mem, RIFF_HEADER_SIZE);
return PARSE_OK;
}
static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
const size_t min_size = CHUNK_HEADER_SIZE;
MemBuffer* const mem = &dmux->mem_;
Frame* frame;
ParseStatus status;
int image_added = 0;
if (dmux->frames_ != NULL) return PARSE_ERROR;
if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
if (frame == NULL) return PARSE_ERROR;
// For the single image case we allow parsing of a partial frame, so no
// minimum size is imposed here.
status = StoreFrame(1, 0, &dmux->mem_, frame);
if (status != PARSE_ERROR) {
const int has_alpha = !!(dmux->feature_flags_ & ALPHA_FLAG);
// Clear any alpha when the alpha flag is missing.
if (!has_alpha && frame->img_components_[1].size_ > 0) {
frame->img_components_[1].offset_ = 0;
frame->img_components_[1].size_ = 0;
frame->has_alpha_ = 0;
}
// Use the frame width/height as the canvas values for non-vp8x files.
// Also, set ALPHA_FLAG if this is a lossless image with alpha.
if (!dmux->is_ext_format_ && frame->width_ > 0 && frame->height_ > 0) {
dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
dmux->canvas_width_ = frame->width_;
dmux->canvas_height_ = frame->height_;
dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
}
if (!AddFrame(dmux, frame)) {
status = PARSE_ERROR; // last frame was left incomplete
} else {
image_added = 1;
dmux->num_frames_ = 1;
}
}
if (!image_added) WebPSafeFree(frame);
return status;
}
static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
MemBuffer* const mem = &dmux->mem_;
int anim_chunks = 0;
ParseStatus status = PARSE_OK;
do {
int store_chunk = 1;
const size_t chunk_start_offset = mem->start_;
const uint32_t fourcc = ReadLE32(mem);
const uint32_t chunk_size = ReadLE32(mem);
const uint32_t chunk_size_padded = chunk_size + (chunk_size & 1);
if (chunk_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
if (SizeIsInvalid(mem, chunk_size_padded)) return PARSE_ERROR;
switch (fourcc) {
case MKFOURCC('V', 'P', '8', 'X'): {
return PARSE_ERROR;
}
case MKFOURCC('A', 'L', 'P', 'H'):
case MKFOURCC('V', 'P', '8', ' '):
case MKFOURCC('V', 'P', '8', 'L'): {
// check that this isn't an animation (all frames should be in an ANMF).
if (anim_chunks > 0 || is_animation) return PARSE_ERROR;
Rewind(mem, CHUNK_HEADER_SIZE);
status = ParseSingleImage(dmux);
break;
}
case MKFOURCC('A', 'N', 'I', 'M'): {
if (chunk_size_padded < ANIM_CHUNK_SIZE) return PARSE_ERROR;
if (MemDataSize(mem) < chunk_size_padded) {
status = PARSE_NEED_MORE_DATA;
} else if (anim_chunks == 0) {
++anim_chunks;
dmux->bgcolor_ = ReadLE32(mem);
dmux->loop_count_ = ReadLE16s(mem);
Skip(mem, chunk_size_padded - ANIM_CHUNK_SIZE);
} else {
store_chunk = 0;
goto Skip;
}
break;
}
case MKFOURCC('A', 'N', 'M', 'F'): {
if (anim_chunks == 0) return PARSE_ERROR; // 'ANIM' precedes frames.
status = ParseAnimationFrame(dmux, chunk_size_padded);
break;
}
case MKFOURCC('I', 'C', 'C', 'P'): {
store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
goto Skip;
}
case MKFOURCC('E', 'X', 'I', 'F'): {
store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
goto Skip;
}
case MKFOURCC('X', 'M', 'P', ' '): {
store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
goto Skip;
}
Skip:
default: {
if (chunk_size_padded <= MemDataSize(mem)) {
if (store_chunk) {
// Store only the chunk header and unpadded size as only the payload
// will be returned to the user.
if (!StoreChunk(dmux, chunk_start_offset,
CHUNK_HEADER_SIZE + chunk_size)) {
return PARSE_ERROR;
}
}
Skip(mem, chunk_size_padded);
} else {
status = PARSE_NEED_MORE_DATA;
}
}
}
if (mem->start_ == mem->riff_end_) {
break;
} else if (MemDataSize(mem) < CHUNK_HEADER_SIZE) {
status = PARSE_NEED_MORE_DATA;
}
} while (status == PARSE_OK);
return status;
}
static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
MemBuffer* const mem = &dmux->mem_;
uint32_t vp8x_size;
if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
dmux->is_ext_format_ = 1;
Skip(mem, TAG_SIZE); // VP8X
vp8x_size = ReadLE32(mem);
if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
vp8x_size += vp8x_size & 1;
if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
dmux->feature_flags_ = ReadByte(mem);
Skip(mem, 3); // Reserved.
dmux->canvas_width_ = 1 + ReadLE24s(mem);
dmux->canvas_height_ = 1 + ReadLE24s(mem);
if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
return PARSE_ERROR; // image final dimension is too large
}
Skip(mem, vp8x_size - VP8X_CHUNK_SIZE); // skip any trailing data.
dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
return ParseVP8XChunks(dmux);
}
// -----------------------------------------------------------------------------
// Format validation
static int IsValidSimpleFormat(const WebPDemuxer* const dmux) {
const Frame* const frame = dmux->frames_;
if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
if (dmux->state_ == WEBP_DEMUX_DONE && frame == NULL) return 0;
if (frame->width_ <= 0 || frame->height_ <= 0) return 0;
return 1;
}
// If 'exact' is true, check that the image resolution matches the canvas.
// If 'exact' is false, check that the x/y offsets do not exceed the canvas.
static int CheckFrameBounds(const Frame* const frame, int exact,
int canvas_width, int canvas_height) {
if (exact) {
if (frame->x_offset_ != 0 || frame->y_offset_ != 0) {
return 0;
}
if (frame->width_ != canvas_width || frame->height_ != canvas_height) {
return 0;
}
} else {
if (frame->x_offset_ < 0 || frame->y_offset_ < 0) return 0;
if (frame->width_ + frame->x_offset_ > canvas_width) return 0;
if (frame->height_ + frame->y_offset_ > canvas_height) return 0;
}
return 1;
}
static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
const Frame* f = dmux->frames_;
if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
if (dmux->loop_count_ < 0) return 0;
if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
if (dmux->feature_flags_ & ~ALL_VALID_FLAGS) return 0; // invalid bitstream
while (f != NULL) {
const int cur_frame_set = f->frame_num_;
int frame_count = 0;
// Check frame properties.
for (; f != NULL && f->frame_num_ == cur_frame_set; f = f->next_) {
const ChunkData* const image = f->img_components_;
const ChunkData* const alpha = f->img_components_ + 1;
if (!is_animation && f->frame_num_ > 1) return 0;
if (f->complete_) {
if (alpha->size_ == 0 && image->size_ == 0) return 0;
// Ensure alpha precedes image bitstream.
if (alpha->size_ > 0 && alpha->offset_ > image->offset_) {
return 0;
}
if (f->width_ <= 0 || f->height_ <= 0) return 0;
} else {
// There shouldn't be a partial frame in a complete file.
if (dmux->state_ == WEBP_DEMUX_DONE) return 0;
// Ensure alpha precedes image bitstream.
if (alpha->size_ > 0 && image->size_ > 0 &&
alpha->offset_ > image->offset_) {
return 0;
}
// There shouldn't be any frames after an incomplete one.
if (f->next_ != NULL) return 0;
}
if (f->width_ > 0 && f->height_ > 0 &&
!CheckFrameBounds(f, !is_animation,
dmux->canvas_width_, dmux->canvas_height_)) {
return 0;
}
++frame_count;
}
}
return 1;
}
// -----------------------------------------------------------------------------
// WebPDemuxer object
static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
dmux->state_ = WEBP_DEMUX_PARSING_HEADER;
dmux->loop_count_ = 1;
dmux->bgcolor_ = 0xFFFFFFFF; // White background by default.
dmux->canvas_width_ = -1;
dmux->canvas_height_ = -1;
dmux->frames_tail_ = &dmux->frames_;
dmux->chunks_tail_ = &dmux->chunks_;
dmux->mem_ = *mem;
}
static ParseStatus CreateRawImageDemuxer(MemBuffer* const mem,
WebPDemuxer** demuxer) {
WebPBitstreamFeatures features;
const VP8StatusCode status =
WebPGetFeatures(mem->buf_, mem->buf_size_, &features);
*demuxer = NULL;
if (status != VP8_STATUS_OK) {
return (status == VP8_STATUS_NOT_ENOUGH_DATA) ? PARSE_NEED_MORE_DATA
: PARSE_ERROR;
}
{
WebPDemuxer* const dmux = (WebPDemuxer*)WebPSafeCalloc(1ULL, sizeof(*dmux));
Frame* const frame = (Frame*)WebPSafeCalloc(1ULL, sizeof(*frame));
if (dmux == NULL || frame == NULL) goto Error;
InitDemux(dmux, mem);
SetFrameInfo(0, mem->buf_size_, 1 /*frame_num*/, 1 /*complete*/, &features,
frame);
if (!AddFrame(dmux, frame)) goto Error;
dmux->state_ = WEBP_DEMUX_DONE;
dmux->canvas_width_ = frame->width_;
dmux->canvas_height_ = frame->height_;
dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
dmux->num_frames_ = 1;
assert(IsValidSimpleFormat(dmux));
*demuxer = dmux;
return PARSE_OK;
Error:
WebPSafeFree(dmux);
WebPSafeFree(frame);
return PARSE_ERROR;
}
}
WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
WebPDemuxState* state, int version) {
const ChunkParser* parser;
int partial;
ParseStatus status = PARSE_ERROR;
MemBuffer mem;
WebPDemuxer* dmux;
if (state != NULL) *state = WEBP_DEMUX_PARSE_ERROR;
if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;
if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
status = ReadHeader(&mem);
if (status != PARSE_OK) {
// If parsing of the webp file header fails attempt to handle a raw
// VP8/VP8L frame. Note 'allow_partial' is ignored in this case.
if (status == PARSE_ERROR) {
status = CreateRawImageDemuxer(&mem, &dmux);
if (status == PARSE_OK) {
if (state != NULL) *state = WEBP_DEMUX_DONE;
return dmux;
}
}
if (state != NULL) {
*state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
: WEBP_DEMUX_PARSE_ERROR;
}
return NULL;
}
partial = (mem.buf_size_ < mem.riff_end_);
if (!allow_partial && partial) return NULL;
dmux = (WebPDemuxer*)WebPSafeCalloc(1ULL, sizeof(*dmux));
if (dmux == NULL) return NULL;
InitDemux(dmux, &mem);
status = PARSE_ERROR;
for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
status = parser->parse(dmux);
if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
if (status == PARSE_ERROR) dmux->state_ = WEBP_DEMUX_PARSE_ERROR;
break;
}
}
if (state != NULL) *state = dmux->state_;
if (status == PARSE_ERROR) {
WebPDemuxDelete(dmux);
return NULL;
}
return dmux;
}
void WebPDemuxDelete(WebPDemuxer* dmux) {
Chunk* c;
Frame* f;
if (dmux == NULL) return;
for (f = dmux->frames_; f != NULL;) {
Frame* const cur_frame = f;
f = f->next_;
WebPSafeFree(cur_frame);
}
for (c = dmux->chunks_; c != NULL;) {
Chunk* const cur_chunk = c;
c = c->next_;
WebPSafeFree(cur_chunk);
}
WebPSafeFree(dmux);
}
// -----------------------------------------------------------------------------
uint32_t WebPDemuxGetI(const WebPDemuxer* dmux, WebPFormatFeature feature) {
if (dmux == NULL) return 0;
switch (feature) {
case WEBP_FF_FORMAT_FLAGS: return dmux->feature_flags_;
case WEBP_FF_CANVAS_WIDTH: return (uint32_t)dmux->canvas_width_;
case WEBP_FF_CANVAS_HEIGHT: return (uint32_t)dmux->canvas_height_;
case WEBP_FF_LOOP_COUNT: return (uint32_t)dmux->loop_count_;
case WEBP_FF_BACKGROUND_COLOR: return dmux->bgcolor_;
case WEBP_FF_FRAME_COUNT: return (uint32_t)dmux->num_frames_;
}
return 0;
}
// -----------------------------------------------------------------------------
// Frame iteration
static const Frame* GetFrame(const WebPDemuxer* const dmux, int frame_num) {
const Frame* f;
for (f = dmux->frames_; f != NULL; f = f->next_) {
if (frame_num == f->frame_num_) break;
}
return f;
}
static const uint8_t* GetFramePayload(const uint8_t* const mem_buf,
const Frame* const frame,
size_t* const data_size) {
*data_size = 0;
if (frame != NULL) {
const ChunkData* const image = frame->img_components_;
const ChunkData* const alpha = frame->img_components_ + 1;
size_t start_offset = image->offset_;
*data_size = image->size_;
// if alpha exists it precedes image, update the size allowing for
// intervening chunks.
if (alpha->size_ > 0) {
const size_t inter_size = (image->offset_ > 0)
? image->offset_ - (alpha->offset_ + alpha->size_)
: 0;
start_offset = alpha->offset_;
*data_size += alpha->size_ + inter_size;
}
return mem_buf + start_offset;
}
return NULL;
}
// Create a whole 'frame' from VP8 (+ alpha) or lossless.
static int SynthesizeFrame(const WebPDemuxer* const dmux,
const Frame* const frame,
WebPIterator* const iter) {
const uint8_t* const mem_buf = dmux->mem_.buf_;
size_t payload_size = 0;
const uint8_t* const payload = GetFramePayload(mem_buf, frame, &payload_size);
if (payload == NULL) return 0;
assert(frame != NULL);
iter->frame_num = frame->frame_num_;
iter->num_frames = dmux->num_frames_;
iter->x_offset = frame->x_offset_;
iter->y_offset = frame->y_offset_;
iter->width = frame->width_;
iter->height = frame->height_;
iter->has_alpha = frame->has_alpha_;
iter->duration = frame->duration_;
iter->dispose_method = frame->dispose_method_;
iter->blend_method = frame->blend_method_;
iter->complete = frame->complete_;
iter->fragment.bytes = payload;
iter->fragment.size = payload_size;
return 1;
}
static int SetFrame(int frame_num, WebPIterator* const iter) {
const Frame* frame;
const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
if (dmux == NULL || frame_num < 0) return 0;
if (frame_num > dmux->num_frames_) return 0;
if (frame_num == 0) frame_num = dmux->num_frames_;
frame = GetFrame(dmux, frame_num);
if (frame == NULL) return 0;
return SynthesizeFrame(dmux, frame, iter);
}
int WebPDemuxGetFrame(const WebPDemuxer* dmux, int frame, WebPIterator* iter) {
if (iter == NULL) return 0;
memset(iter, 0, sizeof(*iter));
iter->private_ = (void*)dmux;
return SetFrame(frame, iter);
}
int WebPDemuxNextFrame(WebPIterator* iter) {
if (iter == NULL) return 0;
return SetFrame(iter->frame_num + 1, iter);
}
int WebPDemuxPrevFrame(WebPIterator* iter) {
if (iter == NULL) return 0;
if (iter->frame_num <= 1) return 0;
return SetFrame(iter->frame_num - 1, iter);
}
void WebPDemuxReleaseIterator(WebPIterator* iter) {
(void)iter;
}
// -----------------------------------------------------------------------------
// Chunk iteration
static int ChunkCount(const WebPDemuxer* const dmux, const char fourcc[4]) {
const uint8_t* const mem_buf = dmux->mem_.buf_;
const Chunk* c;
int count = 0;
for (c = dmux->chunks_; c != NULL; c = c->next_) {
const uint8_t* const header = mem_buf + c->data_.offset_;
if (!memcmp(header, fourcc, TAG_SIZE)) ++count;
}
return count;
}
static const Chunk* GetChunk(const WebPDemuxer* const dmux,
const char fourcc[4], int chunk_num) {
const uint8_t* const mem_buf = dmux->mem_.buf_;
const Chunk* c;
int count = 0;
for (c = dmux->chunks_; c != NULL; c = c->next_) {
const uint8_t* const header = mem_buf + c->data_.offset_;
if (!memcmp(header, fourcc, TAG_SIZE)) ++count;
if (count == chunk_num) break;
}
return c;
}
static int SetChunk(const char fourcc[4], int chunk_num,
WebPChunkIterator* const iter) {
const WebPDemuxer* const dmux = (WebPDemuxer*)iter->private_;
int count;
if (dmux == NULL || fourcc == NULL || chunk_num < 0) return 0;
count = ChunkCount(dmux, fourcc);
if (count == 0) return 0;
if (chunk_num == 0) chunk_num = count;
if (chunk_num <= count) {
const uint8_t* const mem_buf = dmux->mem_.buf_;
const Chunk* const chunk = GetChunk(dmux, fourcc, chunk_num);
iter->chunk.bytes = mem_buf + chunk->data_.offset_ + CHUNK_HEADER_SIZE;
iter->chunk.size = chunk->data_.size_ - CHUNK_HEADER_SIZE;
iter->num_chunks = count;
iter->chunk_num = chunk_num;
return 1;
}
return 0;
}
int WebPDemuxGetChunk(const WebPDemuxer* dmux,
const char fourcc[4], int chunk_num,
WebPChunkIterator* iter) {
if (iter == NULL) return 0;
memset(iter, 0, sizeof(*iter));
iter->private_ = (void*)dmux;
return SetChunk(fourcc, chunk_num, iter);
}
int WebPDemuxNextChunk(WebPChunkIterator* iter) {
if (iter != NULL) {
const char* const fourcc =
(const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
return SetChunk(fourcc, iter->chunk_num + 1, iter);
}
return 0;
}
int WebPDemuxPrevChunk(WebPChunkIterator* iter) {
if (iter != NULL && iter->chunk_num > 1) {
const char* const fourcc =
(const char*)iter->chunk.bytes - CHUNK_HEADER_SIZE;
return SetChunk(fourcc, iter->chunk_num - 1, iter);
}
return 0;
}
void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
(void)iter;
}

View File

@ -0,0 +1,480 @@
// Copyright 2013 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "dsp_dsp.h"
// Tables can be faster on some platform but incur some extra binary size (~2k).
#if !defined(USE_TABLES_FOR_ALPHA_MULT)
#define USE_TABLES_FOR_ALPHA_MULT 0 // ALTERNATE_CODE
#endif
// -----------------------------------------------------------------------------
#define MFIX 24 // 24bit fixed-point arithmetic
#define HALF ((1u << MFIX) >> 1)
#define KINV_255 ((1u << MFIX) / 255u)
static uint32_t Mult(uint8_t x, uint32_t mult) {
const uint32_t v = (x * mult + HALF) >> MFIX;
assert(v <= 255); // <- 24bit precision is enough to ensure that.
return v;
}
#if (USE_TABLES_FOR_ALPHA_MULT == 1)
static const uint32_t kMultTables[2][256] = {
{ // (255u << MFIX) / alpha
0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
{ // alpha * KINV_255
0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
};
static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
return kMultTables[!inverse][a];
}
#else
static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
return inverse ? (255u << MFIX) / a : a * KINV_255;
}
#endif // USE_TABLES_FOR_ALPHA_MULT
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
if (argb < 0xff000000u) { // alpha < 255
if (argb <= 0x00ffffffu) { // alpha == 0
ptr[x] = 0;
} else {
const uint32_t alpha = (argb >> 24) & 0xff;
const uint32_t scale = GetScale(alpha, inverse);
uint32_t out = argb & 0xff000000u;
out |= Mult(argb >> 0, scale) << 0;
out |= Mult(argb >> 8, scale) << 8;
out |= Mult(argb >> 16, scale) << 16;
ptr[x] = out;
}
}
}
}
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t a = alpha[x];
if (a != 255) {
if (a == 0) {
ptr[x] = 0;
} else {
const uint32_t scale = GetScale(a, inverse);
ptr[x] = Mult(ptr[x], scale);
}
}
}
}
#undef KINV_255
#undef HALF
#undef MFIX
void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
//------------------------------------------------------------------------------
// Generic per-plane calls
void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
int inverse) {
int n;
for (n = 0; n < num_rows; ++n) {
WebPMultARGBRow((uint32_t*)ptr, width, inverse);
ptr += stride;
}
}
void WebPMultRows(uint8_t* ptr, int stride,
const uint8_t* alpha, int alpha_stride,
int width, int num_rows, int inverse) {
int n;
for (n = 0; n < num_rows; ++n) {
WebPMultRow(ptr, alpha, width, inverse);
ptr += stride;
alpha += alpha_stride;
}
}
//------------------------------------------------------------------------------
// Premultiplied modes
// non dithered-modes
// (x * a * 32897) >> 23 is bit-wise equivalent to (int)(x * a / 255.)
// for all 8bit x or a. For bit-wise equivalence to (int)(x * a / 255. + .5),
// one can use instead: (x * a * 65793 + (1 << 23)) >> 24
#if 1 // (int)(x * a / 255.)
#define MULTIPLIER(a) ((a) * 32897U)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
#else // (int)(x * a / 255. + .5)
#define MULTIPLIER(a) ((a) * 65793U)
#define PREMULTIPLY(x, m) (((x) * (m) + (1U << 23)) >> 24)
#endif
#if !WEBP_NEON_OMIT_C_CODE
static void ApplyAlphaMultiply_C(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
while (h-- > 0) {
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
int i;
for (i = 0; i < w; ++i) {
const uint32_t a = alpha[4 * i];
if (a != 0xff) {
const uint32_t mult = MULTIPLIER(a);
rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
}
}
rgba += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MULTIPLIER
#undef PREMULTIPLY
// rgbA4444
#define MULTIPLIER(a) ((a) * 0x1111) // 0x1111 ~= (1 << 16) / 15
static WEBP_INLINE uint8_t dither_hi(uint8_t x) {
return (x & 0xf0) | (x >> 4);
}
static WEBP_INLINE uint8_t dither_lo(uint8_t x) {
return (x & 0x0f) | (x << 4);
}
static WEBP_INLINE uint8_t multiply(uint8_t x, uint32_t m) {
return (x * m) >> 16;
}
static WEBP_INLINE void ApplyAlphaMultiply4444_C(uint8_t* rgba4444,
int w, int h, int stride,
int rg_byte_pos /* 0 or 1 */) {
while (h-- > 0) {
int i;
for (i = 0; i < w; ++i) {
const uint32_t rg = rgba4444[2 * i + rg_byte_pos];
const uint32_t ba = rgba4444[2 * i + (rg_byte_pos ^ 1)];
const uint8_t a = ba & 0x0f;
const uint32_t mult = MULTIPLIER(a);
const uint8_t r = multiply(dither_hi(rg), mult);
const uint8_t g = multiply(dither_lo(rg), mult);
const uint8_t b = multiply(dither_hi(ba), mult);
rgba4444[2 * i + rg_byte_pos] = (r & 0xf0) | ((g >> 4) & 0x0f);
rgba4444[2 * i + (rg_byte_pos ^ 1)] = (b & 0xf0) | a;
}
rgba4444 += stride;
}
}
#undef MULTIPLIER
static void ApplyAlphaMultiply_16b_C(uint8_t* rgba4444,
int w, int h, int stride) {
#if (WEBP_SWAP_16BIT_CSP == 1)
ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 1);
#else
ApplyAlphaMultiply4444_C(rgba4444, w, h, stride, 0);
#endif
}
#if !WEBP_NEON_OMIT_C_CODE
static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xff;
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
const uint32_t alpha_value = alpha[i];
dst[4 * i] = alpha_value;
alpha_mask &= alpha_value;
}
alpha += alpha_stride;
dst += dst_stride;
}
return (alpha_mask != 0xff);
}
static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
dst[i] = alpha[i] << 8; // leave A/R/B channels zero'd.
}
alpha += alpha_stride;
dst += dst_stride;
}
}
static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
uint8_t alpha_mask = 0xff;
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
const uint8_t alpha_value = argb[4 * i];
alpha[i] = alpha_value;
alpha_mask &= alpha_value;
}
argb += argb_stride;
alpha += alpha_stride;
}
return (alpha_mask == 0xff);
}
static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
int i;
for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
static int HasAlpha8b_C(const uint8_t* src, int length) {
while (length-- > 0) if (*src++ != 0xff) return 1;
return 0;
}
static int HasAlpha32b_C(const uint8_t* src, int length) {
int x;
for (x = 0; length-- > 0; x += 4) if (src[x] != 0xff) return 1;
return 0;
}
static void AlphaReplace_C(uint32_t* src, int length, uint32_t color) {
int x;
for (x = 0; x < length; ++x) if ((src[x] >> 24) == 0) src[x] = color;
}
//------------------------------------------------------------------------------
// Simple channel manipulations.
static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}
#ifdef WORDS_BIGENDIAN
static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, uint32_t* out) {
int i;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
}
}
#endif
static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out) {
int i, offset = 0;
for (i = 0; i < len; ++i) {
out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
offset += step;
}
}
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
#ifdef WORDS_BIGENDIAN
void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
const uint8_t* b, int, uint32_t*);
#endif
void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
int (*WebPHasAlpha8b)(const uint8_t* src, int length);
int (*WebPHasAlpha32b)(const uint8_t* src, int length);
void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
//------------------------------------------------------------------------------
// Init function
extern void WebPInitAlphaProcessingMIPSdspR2(void);
extern void WebPInitAlphaProcessingSSE2(void);
extern void WebPInitAlphaProcessingSSE41(void);
extern void WebPInitAlphaProcessingNEON(void);
WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
WebPMultARGBRow = WebPMultARGBRow_C;
WebPMultRow = WebPMultRow_C;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
#ifdef WORDS_BIGENDIAN
WebPPackARGB = PackARGB_C;
#endif
WebPPackRGB = PackRGB_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
WebPDispatchAlpha = DispatchAlpha_C;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
WebPExtractAlpha = ExtractAlpha_C;
WebPExtractGreen = ExtractGreen_C;
#endif
WebPHasAlpha8b = HasAlpha8b_C;
WebPHasAlpha32b = HasAlpha32b_C;
WebPAlphaReplace = AlphaReplace_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitAlphaProcessingSSE2();
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitAlphaProcessingSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitAlphaProcessingMIPSdspR2();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitAlphaProcessingNEON();
}
#endif
assert(WebPMultARGBRow != NULL);
assert(WebPMultRow != NULL);
assert(WebPApplyAlphaMultiply != NULL);
assert(WebPApplyAlphaMultiply4444 != NULL);
assert(WebPDispatchAlpha != NULL);
assert(WebPDispatchAlphaToGreen != NULL);
assert(WebPExtractAlpha != NULL);
assert(WebPExtractGreen != NULL);
#ifdef WORDS_BIGENDIAN
assert(WebPPackARGB != NULL);
#endif
assert(WebPPackRGB != NULL);
assert(WebPHasAlpha8b != NULL);
assert(WebPHasAlpha32b != NULL);
assert(WebPAlphaReplace != NULL);
}

View File

@ -0,0 +1,228 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
static int DispatchAlpha_MIPSdspR2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffff;
int i, j, temp0;
for (j = 0; j < height; ++j) {
uint8_t* pdst = dst;
const uint8_t* palpha = alpha;
for (i = 0; i < (width >> 2); ++i) {
int temp1, temp2, temp3;
__asm__ volatile (
"ulw %[temp0], 0(%[palpha]) \n\t"
"addiu %[palpha], %[palpha], 4 \n\t"
"addiu %[pdst], %[pdst], 16 \n\t"
"srl %[temp1], %[temp0], 8 \n\t"
"srl %[temp2], %[temp0], 16 \n\t"
"srl %[temp3], %[temp0], 24 \n\t"
"and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
"sb %[temp0], -16(%[pdst]) \n\t"
"sb %[temp1], -12(%[pdst]) \n\t"
"sb %[temp2], -8(%[pdst]) \n\t"
"sb %[temp3], -4(%[pdst]) \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
[alpha_mask]"+r"(alpha_mask)
:
: "memory"
);
}
for (i = 0; i < (width & 3); ++i) {
__asm__ volatile (
"lbu %[temp0], 0(%[palpha]) \n\t"
"addiu %[palpha], %[palpha], 1 \n\t"
"sb %[temp0], 0(%[pdst]) \n\t"
"and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
"addiu %[pdst], %[pdst], 4 \n\t"
: [temp0]"=&r"(temp0), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
[alpha_mask]"+r"(alpha_mask)
:
: "memory"
);
}
alpha += alpha_stride;
dst += dst_stride;
}
__asm__ volatile (
"ext %[temp0], %[alpha_mask], 0, 16 \n\t"
"srl %[alpha_mask], %[alpha_mask], 16 \n\t"
"and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
"ext %[temp0], %[alpha_mask], 0, 8 \n\t"
"srl %[alpha_mask], %[alpha_mask], 8 \n\t"
"and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
: [temp0]"=&r"(temp0), [alpha_mask]"+r"(alpha_mask)
:
);
return (alpha_mask != 0xff);
}
static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
int inverse) {
int x;
const uint32_t c_00ffffff = 0x00ffffffu;
const uint32_t c_ff000000 = 0xff000000u;
const uint32_t c_8000000 = 0x00800000u;
const uint32_t c_8000080 = 0x00800080u;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
if (argb < 0xff000000u) { // alpha < 255
if (argb <= 0x00ffffffu) { // alpha == 0
ptr[x] = 0;
} else {
int temp0, temp1, temp2, temp3, alpha;
__asm__ volatile (
"srl %[alpha], %[argb], 24 \n\t"
"replv.qb %[temp0], %[alpha] \n\t"
"and %[temp0], %[temp0], %[c_00ffffff] \n\t"
"beqz %[inverse], 0f \n\t"
"divu $zero, %[c_ff000000], %[alpha] \n\t"
"mflo %[temp0] \n\t"
"0: \n\t"
"andi %[temp1], %[argb], 0xff \n\t"
"ext %[temp2], %[argb], 8, 8 \n\t"
"ext %[temp3], %[argb], 16, 8 \n\t"
"mul %[temp1], %[temp1], %[temp0] \n\t"
"mul %[temp2], %[temp2], %[temp0] \n\t"
"mul %[temp3], %[temp3], %[temp0] \n\t"
"precrq.ph.w %[temp1], %[temp2], %[temp1] \n\t"
"addu %[temp3], %[temp3], %[c_8000000] \n\t"
"addu %[temp1], %[temp1], %[c_8000080] \n\t"
"precrq.ph.w %[temp3], %[argb], %[temp3] \n\t"
"precrq.qb.ph %[temp1], %[temp3], %[temp1] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [alpha]"=&r"(alpha)
: [inverse]"r"(inverse), [c_00ffffff]"r"(c_00ffffff),
[c_8000000]"r"(c_8000000), [c_8000080]"r"(c_8000080),
[c_ff000000]"r"(c_ff000000), [argb]"r"(argb)
: "memory", "hi", "lo"
);
ptr[x] = temp1;
}
}
}
}
#ifdef WORDS_BIGENDIAN
static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out) {
int temp0, temp1, temp2, temp3, offset;
const int rest = len & 1;
const uint32_t* const loop_end = out + len - rest;
const int step = 4;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[a]) \n\t"
"lbux %[temp1], %[offset](%[r]) \n\t"
"lbux %[temp2], %[offset](%[g]) \n\t"
"lbux %[temp3], %[offset](%[b]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
#endif // WORDS_BIGENDIAN
static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
const uint8_t* b, int len, int step,
uint32_t* out) {
int temp0, temp1, temp2, offset;
const int rest = len & 1;
const int a = 0xff;
const uint32_t* const loop_end = out + len - rest;
__asm__ volatile (
"xor %[offset], %[offset], %[offset] \n\t"
"beq %[loop_end], %[out], 0f \n\t"
"2: \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"addiu %[out], %[out], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], -4(%[out]) \n\t"
"addu %[offset], %[offset], %[step] \n\t"
"bne %[loop_end], %[out], 2b \n\t"
"0: \n\t"
"beq %[rest], $zero, 1f \n\t"
"lbux %[temp0], %[offset](%[r]) \n\t"
"lbux %[temp1], %[offset](%[g]) \n\t"
"lbux %[temp2], %[offset](%[b]) \n\t"
"ins %[temp0], %[a], 16, 16 \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
"sw %[temp0], 0(%[out]) \n\t"
"1: \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[offset]"=&r"(offset), [out]"+&r"(out)
: [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
[loop_end]"r"(loop_end), [rest]"r"(rest)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
WebPMultARGBRow = MultARGBRow_MIPSdspR2;
#ifdef WORDS_BIGENDIAN
WebPPackARGB = PackARGB_MIPSdspR2;
#endif
WebPPackRGB = PackRGB_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View File

@ -0,0 +1,191 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel, NEON version.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_NEON)
#include "dsp_neon.h"
//------------------------------------------------------------------------------
#define MULTIPLIER(a) ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
#define MULTIPLY_BY_ALPHA(V, ALPHA, OTHER) do { \
const uint8x8_t alpha = (V).val[(ALPHA)]; \
const uint16x8_t r1 = vmull_u8((V).val[1], alpha); \
const uint16x8_t g1 = vmull_u8((V).val[2], alpha); \
const uint16x8_t b1 = vmull_u8((V).val[(OTHER)], alpha); \
/* we use: v / 255 = (v + 1 + (v >> 8)) >> 8 */ \
const uint16x8_t r2 = vsraq_n_u16(r1, r1, 8); \
const uint16x8_t g2 = vsraq_n_u16(g1, g1, 8); \
const uint16x8_t b2 = vsraq_n_u16(b1, b1, 8); \
const uint16x8_t r3 = vaddq_u16(r2, kOne); \
const uint16x8_t g3 = vaddq_u16(g2, kOne); \
const uint16x8_t b3 = vaddq_u16(b2, kOne); \
(V).val[1] = vshrn_n_u16(r3, 8); \
(V).val[2] = vshrn_n_u16(g3, 8); \
(V).val[(OTHER)] = vshrn_n_u16(b3, 8); \
} while (0)
static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
const uint16x8_t kOne = vdupq_n_u16(1u);
while (h-- > 0) {
uint32_t* const rgbx = (uint32_t*)rgba;
int i = 0;
if (alpha_first) {
for (; i + 8 <= w; i += 8) {
// load aaaa...|rrrr...|gggg...|bbbb...
uint8x8x4_t RGBX = vld4_u8((const uint8_t*)(rgbx + i));
MULTIPLY_BY_ALPHA(RGBX, 0, 3);
vst4_u8((uint8_t*)(rgbx + i), RGBX);
}
} else {
for (; i + 8 <= w; i += 8) {
uint8x8x4_t RGBX = vld4_u8((const uint8_t*)(rgbx + i));
MULTIPLY_BY_ALPHA(RGBX, 3, 0);
vst4_u8((uint8_t*)(rgbx + i), RGBX);
}
}
// Finish with left-overs.
for (; i < w; ++i) {
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
const uint32_t a = alpha[4 * i];
if (a != 0xff) {
const uint32_t mult = MULTIPLIER(a);
rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
}
}
rgba += stride;
}
}
#undef MULTIPLY_BY_ALPHA
#undef MULTIPLIER
#undef PREMULTIPLY
//------------------------------------------------------------------------------
static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
int i, j;
for (j = 0; j < height; ++j) {
// We don't know if alpha is first or last in dst[] (depending on rgbA/Argb
// mode). So we must be sure dst[4*i + 8 - 1] is writable for the store.
// Hence the test with 'width - 1' instead of just 'width'.
for (i = 0; i + 8 <= width - 1; i += 8) {
uint8x8x4_t rgbX = vld4_u8((const uint8_t*)(dst + 4 * i));
const uint8x8_t alphas = vld1_u8(alpha + i);
rgbX.val[0] = alphas;
vst4_u8((uint8_t*)(dst + 4 * i), rgbX);
mask8 = vand_u8(mask8, alphas);
}
for (; i < width; ++i) {
const uint32_t alpha_value = alpha[i];
dst[4 * i] = alpha_value;
alpha_mask &= alpha_value;
}
alpha += alpha_stride;
dst += dst_stride;
}
vst1_u8((uint8_t*)tmp, mask8);
alpha_mask &= tmp[0];
alpha_mask &= tmp[1];
return (alpha_mask != 0xffffffffu);
}
static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
int i, j;
uint8x8x4_t greens; // leave A/R/B channels zero'd.
greens.val[0] = vdup_n_u8(0);
greens.val[2] = vdup_n_u8(0);
greens.val[3] = vdup_n_u8(0);
for (j = 0; j < height; ++j) {
for (i = 0; i + 8 <= width; i += 8) {
greens.val[1] = vld1_u8(alpha + i);
vst4_u8((uint8_t*)(dst + i), greens);
}
for (; i < width; ++i) dst[i] = alpha[i] << 8;
alpha += alpha_stride;
dst += dst_stride;
}
}
static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
uint32_t alpha_mask = 0xffffffffu;
uint8x8_t mask8 = vdup_n_u8(0xff);
uint32_t tmp[2];
int i, j;
for (j = 0; j < height; ++j) {
// We don't know if alpha is first or last in dst[] (depending on rgbA/Argb
// mode). So we must be sure dst[4*i + 8 - 1] is writable for the store.
// Hence the test with 'width - 1' instead of just 'width'.
for (i = 0; i + 8 <= width - 1; i += 8) {
const uint8x8x4_t rgbX = vld4_u8((const uint8_t*)(argb + 4 * i));
const uint8x8_t alphas = rgbX.val[0];
vst1_u8((uint8_t*)(alpha + i), alphas);
mask8 = vand_u8(mask8, alphas);
}
for (; i < width; ++i) {
alpha[i] = argb[4 * i];
alpha_mask &= alpha[i];
}
argb += argb_stride;
alpha += alpha_stride;
}
vst1_u8((uint8_t*)tmp, mask8);
alpha_mask &= tmp[0];
alpha_mask &= tmp[1];
return (alpha_mask == 0xffffffffu);
}
static void ExtractGreen_NEON(const uint32_t* argb,
uint8_t* alpha, int size) {
int i;
for (i = 0; i + 16 <= size; i += 16) {
const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
const uint8x16_t greens = rgbX.val[1];
vst1q_u8(alpha + i, greens);
}
for (; i < size; ++i) alpha[i] = (argb[i] >> 8) & 0xff;
}
//------------------------------------------------------------------------------
extern void WebPInitAlphaProcessingNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingNEON(void) {
WebPApplyAlphaMultiply = ApplyAlphaMultiply_NEON;
WebPDispatchAlpha = DispatchAlpha_NEON;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_NEON;
WebPExtractAlpha = ExtractAlpha_NEON;
WebPExtractGreen = ExtractGreen_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingNEON)
#endif // WEBP_USE_NEON

View File

@ -0,0 +1,365 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
//------------------------------------------------------------------------------
static int DispatchAlpha_SSE2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
int i, j;
const __m128i zero = _mm_setzero_si128();
const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB
const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
__m128i all_alphas = all_0xff;
// We must be able to access 3 extra bytes after the last written byte
// 'dst[4 * width - 4]', because we don't know if alpha is the first or the
// last byte of the quadruplet.
const int limit = (width - 1) & ~7;
for (j = 0; j < height; ++j) {
__m128i* out = (__m128i*)dst;
for (i = 0; i < limit; i += 8) {
// load 8 alpha bytes
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
// load 8 dst pixels (32 bytes)
const __m128i b0_lo = _mm_loadu_si128(out + 0);
const __m128i b0_hi = _mm_loadu_si128(out + 1);
// mask dst alpha values
const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
// combine
const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
// store
_mm_storeu_si128(out + 0, b2_lo);
_mm_storeu_si128(out + 1, b2_hi);
// accumulate eight alpha 'and' in parallel
all_alphas = _mm_and_si128(all_alphas, a0);
out += 2;
}
for (; i < width; ++i) {
const uint32_t alpha_value = alpha[i];
dst[4 * i] = alpha_value;
alpha_and &= alpha_value;
}
alpha += alpha_stride;
dst += dst_stride;
}
// Combine the eight alpha 'and' into a 8-bit mask.
alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
return (alpha_and != 0xff);
}
static void DispatchAlphaToGreen_SSE2(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride) {
int i, j;
const __m128i zero = _mm_setzero_si128();
const int limit = width & ~15;
for (j = 0; j < height; ++j) {
for (i = 0; i < limit; i += 16) { // process 16 alpha bytes
const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
const __m128i a1 = _mm_unpacklo_epi8(zero, a0); // note the 'zero' first!
const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
_mm_storeu_si128((__m128i*)&dst[i + 0], a2_lo);
_mm_storeu_si128((__m128i*)&dst[i + 4], a2_hi);
_mm_storeu_si128((__m128i*)&dst[i + 8], b2_lo);
_mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
}
for (; i < width; ++i) dst[i] = alpha[i] << 8;
alpha += alpha_stride;
dst += dst_stride;
}
}
static int ExtractAlpha_SSE2(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
int i, j;
const __m128i a_mask = _mm_set1_epi32(0xffu); // to preserve alpha
const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
__m128i all_alphas = all_0xff;
// We must be able to access 3 extra bytes after the last written byte
// 'src[4 * width - 4]', because we don't know if alpha is the first or the
// last byte of the quadruplet.
const int limit = (width - 1) & ~7;
for (j = 0; j < height; ++j) {
const __m128i* src = (const __m128i*)argb;
for (i = 0; i < limit; i += 8) {
// load 32 argb bytes
const __m128i a0 = _mm_loadu_si128(src + 0);
const __m128i a1 = _mm_loadu_si128(src + 1);
const __m128i b0 = _mm_and_si128(a0, a_mask);
const __m128i b1 = _mm_and_si128(a1, a_mask);
const __m128i c0 = _mm_packs_epi32(b0, b1);
const __m128i d0 = _mm_packus_epi16(c0, c0);
// store
_mm_storel_epi64((__m128i*)&alpha[i], d0);
// accumulate eight alpha 'and' in parallel
all_alphas = _mm_and_si128(all_alphas, d0);
src += 2;
}
for (; i < width; ++i) {
const uint32_t alpha_value = argb[4 * i];
alpha[i] = alpha_value;
alpha_and &= alpha_value;
}
argb += argb_stride;
alpha += alpha_stride;
}
// Combine the eight alpha 'and' into a 8-bit mask.
alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
return (alpha_and == 0xff);
}
//------------------------------------------------------------------------------
// Non-dither premultiplied modes
#define MULTIPLIER(a) ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
// We can't use a 'const int' for the SHUFFLE value, because it has to be an
// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit
// value.
#define APPLY_ALPHA(RGBX, SHUFFLE) do { \
const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX)); \
const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero); \
const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero); \
const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask); \
const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask); \
const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \
const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \
const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \
const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \
/* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */ \
const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo); \
const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi); \
const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult); \
const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult); \
const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7); \
const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7); \
const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi); \
_mm_storeu_si128((__m128i*)&(RGBX), A3); \
} while (0)
static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
int w, int h, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i kMult = _mm_set1_epi16(0x8081u);
const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
const int kSpan = 4;
while (h-- > 0) {
uint32_t* const rgbx = (uint32_t*)rgba;
int i;
if (!alpha_first) {
for (i = 0; i + kSpan <= w; i += kSpan) {
APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
}
} else {
for (i = 0; i + kSpan <= w; i += kSpan) {
APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
}
}
// Finish with left-overs.
for (; i < w; ++i) {
uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
const uint32_t a = alpha[4 * i];
if (a != 0xff) {
const uint32_t mult = MULTIPLIER(a);
rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
}
}
rgba += stride;
}
}
#undef MULTIPLIER
#undef PREMULTIPLY
//------------------------------------------------------------------------------
// Alpha detection
static int HasAlpha8b_SSE2(const uint8_t* src, int length) {
const __m128i all_0xff = _mm_set1_epi8((char)0xff);
int i = 0;
for (; i + 16 <= length; i += 16) {
const __m128i v = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i bits = _mm_cmpeq_epi8(v, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i < length; ++i) if (src[i] != 0xff) return 1;
return 0;
}
static int HasAlpha32b_SSE2(const uint8_t* src, int length) {
const __m128i alpha_mask = _mm_set1_epi32(0xff);
const __m128i all_0xff = _mm_set1_epi8((char)0xff);
int i = 0;
// We don't know if we can access the last 3 bytes after the last alpha
// value 'src[4 * length - 4]' (because we don't know if alpha is the first
// or the last byte of the quadruplet). Hence the '-3' protection below.
length = length * 4 - 3; // size in bytes
for (; i + 64 <= length; i += 64) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32));
const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48));
const __m128i b0 = _mm_and_si128(a0, alpha_mask);
const __m128i b1 = _mm_and_si128(a1, alpha_mask);
const __m128i b2 = _mm_and_si128(a2, alpha_mask);
const __m128i b3 = _mm_and_si128(a3, alpha_mask);
const __m128i c0 = _mm_packs_epi32(b0, b1);
const __m128i c1 = _mm_packs_epi32(b2, b3);
const __m128i d = _mm_packus_epi16(c0, c1);
const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i + 32 <= length; i += 32) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16));
const __m128i b0 = _mm_and_si128(a0, alpha_mask);
const __m128i b1 = _mm_and_si128(a1, alpha_mask);
const __m128i c = _mm_packs_epi32(b0, b1);
const __m128i d = _mm_packus_epi16(c, c);
const __m128i bits = _mm_cmpeq_epi8(d, all_0xff);
const int mask = _mm_movemask_epi8(bits);
if (mask != 0xffff) return 1;
}
for (; i <= length; i += 4) if (src[i] != 0xff) return 1;
return 0;
}
static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) {
const __m128i m_color = _mm_set1_epi32(color);
const __m128i zero = _mm_setzero_si128();
int i = 0;
for (; i + 8 <= length; i += 8) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4));
const __m128i b0 = _mm_srai_epi32(a0, 24);
const __m128i b1 = _mm_srai_epi32(a1, 24);
const __m128i c0 = _mm_cmpeq_epi32(b0, zero);
const __m128i c1 = _mm_cmpeq_epi32(b1, zero);
const __m128i d0 = _mm_and_si128(c0, m_color);
const __m128i d1 = _mm_and_si128(c1, m_color);
const __m128i e0 = _mm_andnot_si128(c0, a0);
const __m128i e1 = _mm_andnot_si128(c1, a1);
_mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0));
_mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1));
}
for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color;
}
// -----------------------------------------------------------------------------
// Apply alpha value to rows
static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
int x = 0;
if (!inverse) {
const int kSpan = 2;
const __m128i zero = _mm_setzero_si128();
const __m128i k128 = _mm_set1_epi16(128);
const __m128i kMult = _mm_set1_epi16(0x0101);
const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
for (x = 0; x + kSpan <= width; x += kSpan) {
// To compute 'result = (int)(a * x / 255. + .5)', we use:
// tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
const __m128i A2 = _mm_or_si128(A1, kMask);
const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
// here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
const __m128i A5 = _mm_mullo_epi16(A4, A1);
const __m128i A6 = _mm_add_epi16(A5, k128);
const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
const __m128i A10 = _mm_packus_epi16(A7, zero);
_mm_storel_epi64((__m128i*)&ptr[x], A10);
}
}
width -= x;
if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse);
}
static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse) {
int x = 0;
if (!inverse) {
const __m128i zero = _mm_setzero_si128();
const __m128i k128 = _mm_set1_epi16(128);
const __m128i kMult = _mm_set1_epi16(0x0101);
for (x = 0; x + 8 <= width; x += 8) {
const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
const __m128i v2 = _mm_mullo_epi16(v1, a1);
const __m128i v3 = _mm_add_epi16(v2, k128);
const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
const __m128i v5 = _mm_packus_epi16(v4, zero);
_mm_storel_epi64((__m128i*)&ptr[x], v5);
}
}
width -= x;
if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse);
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
WebPMultARGBRow = MultARGBRow_SSE2;
WebPMultRow = MultRow_SSE2;
WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
WebPDispatchAlpha = DispatchAlpha_SSE2;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2;
WebPExtractAlpha = ExtractAlpha_SSE2;
WebPHasAlpha8b = HasAlpha8b_SSE2;
WebPHasAlpha32b = HasAlpha32b_SSE2;
WebPAlphaReplace = AlphaReplace_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)
#endif // WEBP_USE_SSE2

View File

@ -0,0 +1,92 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Utilities for processing transparent channel, SSE4.1 variant.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
//------------------------------------------------------------------------------
static int ExtractAlpha_SSE41(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
// alpha_and stores an 'and' operation of all the alpha[] values. The final
// value is not 0xff if any of the alpha[] is not equal to 0xff.
uint32_t alpha_and = 0xff;
int i, j;
const __m128i all_0xff = _mm_set1_epi32(~0u);
__m128i all_alphas = all_0xff;
// We must be able to access 3 extra bytes after the last written byte
// 'src[4 * width - 4]', because we don't know if alpha is the first or the
// last byte of the quadruplet.
const int limit = (width - 1) & ~15;
const __m128i kCstAlpha0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
-1, -1, -1, -1, 12, 8, 4, 0);
const __m128i kCstAlpha1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
12, 8, 4, 0, -1, -1, -1, -1);
const __m128i kCstAlpha2 = _mm_set_epi8(-1, -1, -1, -1, 12, 8, 4, 0,
-1, -1, -1, -1, -1, -1, -1, -1);
const __m128i kCstAlpha3 = _mm_set_epi8(12, 8, 4, 0, -1, -1, -1, -1,
-1, -1, -1, -1, -1, -1, -1, -1);
for (j = 0; j < height; ++j) {
const __m128i* src = (const __m128i*)argb;
for (i = 0; i < limit; i += 16) {
// load 64 argb bytes
const __m128i a0 = _mm_loadu_si128(src + 0);
const __m128i a1 = _mm_loadu_si128(src + 1);
const __m128i a2 = _mm_loadu_si128(src + 2);
const __m128i a3 = _mm_loadu_si128(src + 3);
const __m128i b0 = _mm_shuffle_epi8(a0, kCstAlpha0);
const __m128i b1 = _mm_shuffle_epi8(a1, kCstAlpha1);
const __m128i b2 = _mm_shuffle_epi8(a2, kCstAlpha2);
const __m128i b3 = _mm_shuffle_epi8(a3, kCstAlpha3);
const __m128i c0 = _mm_or_si128(b0, b1);
const __m128i c1 = _mm_or_si128(b2, b3);
const __m128i d0 = _mm_or_si128(c0, c1);
// store
_mm_storeu_si128((__m128i*)&alpha[i], d0);
// accumulate sixteen alpha 'and' in parallel
all_alphas = _mm_and_si128(all_alphas, d0);
src += 4;
}
for (; i < width; ++i) {
const uint32_t alpha_value = argb[4 * i];
alpha[i] = alpha_value;
alpha_and &= alpha_value;
}
argb += argb_stride;
alpha += alpha_stride;
}
// Combine the sixteen alpha 'and' into an 8-bit mask.
alpha_and |= 0xff00u; // pretend the upper bits [8..15] were tested ok.
alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
return (alpha_and == 0xffffu);
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitAlphaProcessingSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
WebPExtractAlpha = ExtractAlpha_SSE41;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE41)
#endif // WEBP_USE_SSE41

View File

@ -0,0 +1,194 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 code common to several files.
//
// Author: Vincent Rabaud (vrabaud@google.com)
#ifndef WEBP_DSP_COMMON_SSE2_H_
#define WEBP_DSP_COMMON_SSE2_H_
#ifdef __cplusplus
extern "C" {
#endif
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
//------------------------------------------------------------------------------
// Quite useful macro for debugging. Left here for convenience.
#if 0
#include <stdio.h>
static WEBP_INLINE void PrintReg(const __m128i r, const char* const name,
int size) {
int n;
union {
__m128i r;
uint8_t i8[16];
uint16_t i16[8];
uint32_t i32[4];
uint64_t i64[2];
} tmp;
tmp.r = r;
fprintf(stderr, "%s\t: ", name);
if (size == 8) {
for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
} else if (size == 16) {
for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
} else if (size == 32) {
for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
} else {
for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]);
}
fprintf(stderr, "\n");
}
#endif
//------------------------------------------------------------------------------
// Math functions.
// Return the sum of all the 8b in the register.
static WEBP_INLINE int VP8HorizontalAdd8b(const __m128i* const a) {
const __m128i zero = _mm_setzero_si128();
const __m128i sad8x2 = _mm_sad_epu8(*a, zero);
// sum the two sads: sad8x2[0:1] + sad8x2[8:9]
const __m128i sum = _mm_add_epi32(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
return _mm_cvtsi128_si32(sum);
}
// Transpose two 4x4 16b matrices horizontally stored in registers.
static WEBP_INLINE void VP8Transpose_2_4x4_16b(
const __m128i* const in0, const __m128i* const in1,
const __m128i* const in2, const __m128i* const in3, __m128i* const out0,
__m128i* const out1, __m128i* const out2, __m128i* const out3) {
// Transpose the two 4x4.
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
const __m128i transpose0_0 = _mm_unpacklo_epi16(*in0, *in1);
const __m128i transpose0_1 = _mm_unpacklo_epi16(*in2, *in3);
const __m128i transpose0_2 = _mm_unpackhi_epi16(*in0, *in1);
const __m128i transpose0_3 = _mm_unpackhi_epi16(*in2, *in3);
// a00 a10 a01 a11 a02 a12 a03 a13
// a20 a30 a21 a31 a22 a32 a23 a33
// b00 b10 b01 b11 b02 b12 b03 b13
// b20 b30 b21 b31 b22 b32 b23 b33
const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
// a00 a10 a20 a30 a01 a11 a21 a31
// b00 b10 b20 b30 b01 b11 b21 b31
// a02 a12 a22 a32 a03 a13 a23 a33
// b02 b12 a22 b32 b03 b13 b23 b33
*out0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
*out1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
*out2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
*out3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
// a00 a10 a20 a30 b00 b10 b20 b30
// a01 a11 a21 a31 b01 b11 b21 b31
// a02 a12 a22 a32 b02 b12 b22 b32
// a03 a13 a23 a33 b03 b13 b23 b33
}
//------------------------------------------------------------------------------
// Channel mixing.
// Function used several times in VP8PlanarTo24b.
// It samples the in buffer as follows: one every two unsigned char is stored
// at the beginning of the buffer, while the other half is stored at the end.
#define VP8PlanarTo24bHelper(IN, OUT) \
do { \
const __m128i v_mask = _mm_set1_epi16(0x00ff); \
/* Take one every two upper 8b values.*/ \
(OUT##0) = _mm_packus_epi16(_mm_and_si128((IN##0), v_mask), \
_mm_and_si128((IN##1), v_mask)); \
(OUT##1) = _mm_packus_epi16(_mm_and_si128((IN##2), v_mask), \
_mm_and_si128((IN##3), v_mask)); \
(OUT##2) = _mm_packus_epi16(_mm_and_si128((IN##4), v_mask), \
_mm_and_si128((IN##5), v_mask)); \
/* Take one every two lower 8b values.*/ \
(OUT##3) = _mm_packus_epi16(_mm_srli_epi16((IN##0), 8), \
_mm_srli_epi16((IN##1), 8)); \
(OUT##4) = _mm_packus_epi16(_mm_srli_epi16((IN##2), 8), \
_mm_srli_epi16((IN##3), 8)); \
(OUT##5) = _mm_packus_epi16(_mm_srli_epi16((IN##4), 8), \
_mm_srli_epi16((IN##5), 8)); \
} while (0)
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void VP8PlanarTo24b_SSE2(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
// around as follows:
// Input:
// r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
// Split the 6 registers in two sets of 3 registers: the first set as the even
// 8b bytes, the second the odd ones:
// r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
__m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
VP8PlanarTo24bHelper(*in, tmp);
VP8PlanarTo24bHelper(tmp, *in);
VP8PlanarTo24bHelper(*in, tmp);
// We need to do it two more times than the example as we have sixteen bytes.
{
__m128i out0, out1, out2, out3, out4, out5;
VP8PlanarTo24bHelper(tmp, out);
VP8PlanarTo24bHelper(out, *in);
}
}
#undef VP8PlanarTo24bHelper
// Convert four packed four-channel buffers like argbargbargbargb... into the
// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
static WEBP_INLINE void VP8L32bToPlanar_SSE2(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
// Column-wise transpose.
const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);
const __m128i A2 = _mm_unpacklo_epi8(*in2, *in3);
const __m128i A3 = _mm_unpackhi_epi8(*in2, *in3);
const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
const __m128i B2 = _mm_unpacklo_epi8(A2, A3);
const __m128i B3 = _mm_unpackhi_epi8(A2, A3);
// C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
// C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
const __m128i C2 = _mm_unpacklo_epi8(B2, B3);
const __m128i C3 = _mm_unpackhi_epi8(B2, B3);
// Gather the channels.
*in0 = _mm_unpackhi_epi64(C1, C3);
*in1 = _mm_unpacklo_epi64(C1, C3);
*in2 = _mm_unpackhi_epi64(C0, C2);
*in3 = _mm_unpacklo_epi64(C0, C2);
}
#endif // WEBP_USE_SSE2
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_COMMON_SSE2_H_

View File

@ -0,0 +1,132 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 code common to several files.
//
// Author: Vincent Rabaud (vrabaud@google.com)
#ifndef WEBP_DSP_COMMON_SSE41_H_
#define WEBP_DSP_COMMON_SSE41_H_
#ifdef __cplusplus
extern "C" {
#endif
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
//------------------------------------------------------------------------------
// Channel mixing.
// Shuffles the input buffer as A0 0 0 A1 0 0 A2 ...
#define WEBP_SSE41_SHUFF(OUT, IN0, IN1) \
OUT##0 = _mm_shuffle_epi8(*IN0, shuff0); \
OUT##1 = _mm_shuffle_epi8(*IN0, shuff1); \
OUT##2 = _mm_shuffle_epi8(*IN0, shuff2); \
OUT##3 = _mm_shuffle_epi8(*IN1, shuff0); \
OUT##4 = _mm_shuffle_epi8(*IN1, shuff1); \
OUT##5 = _mm_shuffle_epi8(*IN1, shuff2);
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void VP8PlanarTo24b_SSE41(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5) {
__m128i R0, R1, R2, R3, R4, R5;
__m128i G0, G1, G2, G3, G4, G5;
__m128i B0, B1, B2, B3, B4, B5;
// Process R.
{
const __m128i shuff0 = _mm_set_epi8(
5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0);
const __m128i shuff1 = _mm_set_epi8(
-1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
-1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1);
WEBP_SSE41_SHUFF(R, in0, in1)
}
// Process G.
{
// Same as before, just shifted to the left by one and including the right
// padding.
const __m128i shuff0 = _mm_set_epi8(
-1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1);
const __m128i shuff1 = _mm_set_epi8(
10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5);
const __m128i shuff2 = _mm_set_epi8(
-1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1);
WEBP_SSE41_SHUFF(G, in2, in3)
}
// Process B.
{
const __m128i shuff0 = _mm_set_epi8(
-1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1);
const __m128i shuff2 = _mm_set_epi8(
15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10);
WEBP_SSE41_SHUFF(B, in4, in5)
}
// OR the different channels.
{
const __m128i RG0 = _mm_or_si128(R0, G0);
const __m128i RG1 = _mm_or_si128(R1, G1);
const __m128i RG2 = _mm_or_si128(R2, G2);
const __m128i RG3 = _mm_or_si128(R3, G3);
const __m128i RG4 = _mm_or_si128(R4, G4);
const __m128i RG5 = _mm_or_si128(R5, G5);
*in0 = _mm_or_si128(RG0, B0);
*in1 = _mm_or_si128(RG1, B1);
*in2 = _mm_or_si128(RG2, B2);
*in3 = _mm_or_si128(RG3, B3);
*in4 = _mm_or_si128(RG4, B4);
*in5 = _mm_or_si128(RG5, B5);
}
}
#undef WEBP_SSE41_SHUFF
// Convert four packed four-channel buffers like argbargbargbargb... into the
// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
static WEBP_INLINE void VP8L32bToPlanar_SSE41(__m128i* const in0,
__m128i* const in1,
__m128i* const in2,
__m128i* const in3) {
// aaaarrrrggggbbbb
const __m128i shuff0 =
_mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
const __m128i A0 = _mm_shuffle_epi8(*in0, shuff0);
const __m128i A1 = _mm_shuffle_epi8(*in1, shuff0);
const __m128i A2 = _mm_shuffle_epi8(*in2, shuff0);
const __m128i A3 = _mm_shuffle_epi8(*in3, shuff0);
// A0A1R0R1
// G0G1B0B1
// A2A3R2R3
// G0G1B0B1
const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
const __m128i B1 = _mm_unpackhi_epi32(A0, A1);
const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
const __m128i B3 = _mm_unpackhi_epi32(A2, A3);
*in3 = _mm_unpacklo_epi64(B0, B2);
*in2 = _mm_unpackhi_epi64(B0, B2);
*in1 = _mm_unpacklo_epi64(B1, B3);
*in0 = _mm_unpackhi_epi64(B1, B3);
}
#endif // WEBP_USE_SSE41
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_COMMON_SSE41_H_

411
vendor/github.com/sizeofint/webpanimation/dsp_cost.c generated vendored Normal file
View File

@ -0,0 +1,411 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#include "enc_cost_enc.h"
//------------------------------------------------------------------------------
// Boolean-cost cost table
const uint16_t VP8EntropyCost[256] = {
1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
1178, 1152, 1110, 1076, 1061, 1024, 1024, 992, 968, 951,
939, 911, 896, 878, 871, 854, 838, 820, 811, 794,
786, 768, 768, 752, 740, 732, 720, 709, 704, 690,
683, 672, 666, 655, 647, 640, 631, 622, 615, 607,
598, 592, 586, 576, 572, 564, 559, 555, 547, 541,
534, 528, 522, 512, 512, 504, 500, 494, 488, 483,
477, 473, 467, 461, 458, 452, 448, 443, 438, 434,
427, 424, 419, 415, 410, 406, 403, 399, 394, 390,
384, 384, 377, 374, 370, 366, 362, 359, 355, 351,
347, 342, 342, 336, 333, 330, 326, 323, 320, 316,
312, 308, 305, 302, 299, 296, 293, 288, 287, 283,
280, 277, 274, 272, 268, 266, 262, 256, 256, 256,
251, 248, 245, 242, 240, 237, 234, 232, 228, 226,
223, 221, 218, 216, 214, 211, 208, 205, 203, 201,
198, 196, 192, 191, 188, 187, 183, 181, 179, 176,
175, 171, 171, 168, 165, 163, 160, 159, 156, 154,
152, 150, 148, 146, 144, 142, 139, 138, 135, 133,
131, 128, 128, 125, 123, 121, 119, 117, 115, 113,
111, 110, 107, 105, 103, 102, 100, 98, 96, 94,
92, 91, 89, 86, 86, 83, 82, 80, 77, 76,
74, 73, 71, 69, 67, 66, 64, 63, 61, 59,
57, 55, 54, 52, 51, 49, 47, 46, 44, 43,
41, 40, 38, 36, 35, 33, 32, 30, 29, 27,
25, 24, 22, 21, 19, 18, 16, 15, 13, 12,
10, 9, 7, 6, 4, 3
};
//------------------------------------------------------------------------------
// Level cost tables
// fixed costs for coding levels, deduce from the coding tree.
// This is only the part that doesn't depend on the probability state.
const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
0, 256, 256, 256, 256, 432, 618, 630,
731, 640, 640, 828, 901, 948, 1021, 1101,
1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
};
//------------------------------------------------------------------------------
// Tables for level coding
const uint8_t VP8EncBands[16 + 1] = {
0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
0 // sentinel
};
//------------------------------------------------------------------------------
// Mode costs
static int GetResidualCost_C(int ctx0, const VP8Residual* const res) {
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
if (res->last < 0) {
return VP8BitCost(0, p0);
}
for (; n < res->last; ++n) {
const int v = abs(res->coeffs[n]);
const int ctx = (v >= 2) ? 2 : v;
cost += VP8LevelCost(t, v);
t = costs[n + 1][ctx];
}
// Last coefficient is always non-zero
{
const int v = abs(res->coeffs[n]);
assert(v != 0);
cost += VP8LevelCost(t, v);
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = (v == 1) ? 1 : 2;
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
static void SetResidualCoeffs_C(const int16_t* const coeffs,
VP8Residual* const res) {
int n;
res->last = -1;
assert(res->first == 0 || coeffs[0] == 0);
for (n = 15; n >= 0; --n) {
if (coeffs[n]) {
res->last = n;
break;
}
}
res->coeffs = coeffs;
}
//------------------------------------------------------------------------------
// init function
VP8GetResidualCostFunc VP8GetResidualCost;
VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
extern void VP8EncDspCostInitMIPS32(void);
extern void VP8EncDspCostInitMIPSdspR2(void);
extern void VP8EncDspCostInitSSE2(void);
extern void VP8EncDspCostInitNEON(void);
WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
VP8GetResidualCost = GetResidualCost_C;
VP8SetResidualCoeffs = SetResidualCoeffs_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8EncDspCostInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspCostInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspCostInitSSE2();
}
#endif
#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8EncDspCostInitNEON();
}
#endif
}
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,154 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "enc_cost_enc.h"
static int GetResidualCost_MIPS32(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
const int16_t* res_coeffs = res->coeffs;
const int res_last = res->last;
const int const_max_level = MAX_VARIABLE_LEVEL;
const int const_2 = 2;
const uint16_t** p_costs = &costs[n][0];
const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
if (res->last < 0) {
return VP8BitCost(0, p0);
}
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"subu %[temp1], %[res_last], %[n] \n\t"
"sll %[temp0], %[n], 1 \n\t"
"blez %[temp1], 2f \n\t"
" addu %[res_coeffs], %[res_coeffs], %[temp0] \n\t"
"1: \n\t"
"lh %[v_reg], 0(%[res_coeffs]) \n\t"
"addiu %[n], %[n], 1 \n\t"
"negu %[temp0], %[v_reg] \n\t"
"slti %[temp1], %[v_reg], 0 \n\t"
"movn %[v_reg], %[temp0], %[temp1] \n\t"
"sltiu %[temp0], %[v_reg], 2 \n\t"
"move %[ctx_reg], %[v_reg] \n\t"
"movz %[ctx_reg], %[const_2], %[temp0] \n\t"
"sll %[temp1], %[v_reg], 1 \n\t"
"addu %[temp1], %[temp1], %[VP8LevelFixedCosts] \n\t"
"lhu %[temp1], 0(%[temp1]) \n\t"
"slt %[temp0], %[v_reg], %[const_max_level] \n\t"
"movz %[v_reg], %[const_max_level], %[temp0] \n\t"
"addu %[cost], %[cost], %[temp1] \n\t"
"sll %[v_reg], %[v_reg], 1 \n\t"
"sll %[ctx_reg], %[ctx_reg], 2 \n\t"
"addu %[v_reg], %[v_reg], %[t] \n\t"
"lhu %[temp0], 0(%[v_reg]) \n\t"
"addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
"addu %[t], %[p_costs], %[ctx_reg] \n\t"
"addu %[cost], %[cost], %[temp0] \n\t"
"addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
"bne %[n], %[res_last], 1b \n\t"
" lw %[t], 0(%[t]) \n\t"
"2: \n\t"
".set pop \n\t"
: [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
[ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
: [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
[VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
[inc_p_costs]"r"(inc_p_costs)
: "memory"
);
// Last coefficient is always non-zero
{
const int v = abs(res->coeffs[n]);
assert(v != 0);
cost += VP8LevelCost(t, v);
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = (v == 1) ? 1 : 2;
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
static void SetResidualCoeffs_MIPS32(const int16_t* const coeffs,
VP8Residual* const res) {
const int16_t* p_coeffs = (int16_t*)coeffs;
int temp0, temp1, temp2, n, n1;
assert(res->first == 0 || coeffs[0] == 0);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"addiu %[p_coeffs], %[p_coeffs], 28 \n\t"
"li %[n], 15 \n\t"
"li %[temp2], -1 \n\t"
"0: \n\t"
"ulw %[temp0], 0(%[p_coeffs]) \n\t"
"beqz %[temp0], 1f \n\t"
#if defined(WORDS_BIGENDIAN)
" sll %[temp1], %[temp0], 16 \n\t"
#else
" srl %[temp1], %[temp0], 16 \n\t"
#endif
"addiu %[n1], %[n], -1 \n\t"
"movz %[temp0], %[n1], %[temp1] \n\t"
"movn %[temp0], %[n], %[temp1] \n\t"
"j 2f \n\t"
" addiu %[temp2], %[temp0], 0 \n\t"
"1: \n\t"
"addiu %[n], %[n], -2 \n\t"
"bgtz %[n], 0b \n\t"
" addiu %[p_coeffs], %[p_coeffs], -4 \n\t"
"2: \n\t"
".set pop \n\t"
: [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[n]"=&r"(n), [n1]"=&r"(n1)
:
: "memory"
);
res->last = temp2;
res->coeffs = coeffs;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
VP8GetResidualCost = GetResidualCost_MIPS32;
VP8SetResidualCoeffs = SetResidualCoeffs_MIPS32;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
#endif // WEBP_USE_MIPS32

View File

@ -0,0 +1,107 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Djordje Pesut (djordje.pesut@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "enc_cost_enc.h"
static int GetResidualCost_MIPSdspR2(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
int v_reg, ctx_reg;
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
const int16_t* res_coeffs = res->coeffs;
const int res_last = res->last;
const int const_max_level = MAX_VARIABLE_LEVEL;
const int const_2 = 2;
const uint16_t** p_costs = &costs[n][0];
const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
if (res->last < 0) {
return VP8BitCost(0, p0);
}
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"subu %[temp1], %[res_last], %[n] \n\t"
"blez %[temp1], 2f \n\t"
" nop \n\t"
"1: \n\t"
"sll %[temp0], %[n], 1 \n\t"
"lhx %[v_reg], %[temp0](%[res_coeffs]) \n\t"
"addiu %[n], %[n], 1 \n\t"
"absq_s.w %[v_reg], %[v_reg] \n\t"
"sltiu %[temp0], %[v_reg], 2 \n\t"
"move %[ctx_reg], %[v_reg] \n\t"
"movz %[ctx_reg], %[const_2], %[temp0] \n\t"
"sll %[temp1], %[v_reg], 1 \n\t"
"lhx %[temp1], %[temp1](%[VP8LevelFixedCosts]) \n\t"
"slt %[temp0], %[v_reg], %[const_max_level] \n\t"
"movz %[v_reg], %[const_max_level], %[temp0] \n\t"
"addu %[cost], %[cost], %[temp1] \n\t"
"sll %[v_reg], %[v_reg], 1 \n\t"
"sll %[ctx_reg], %[ctx_reg], 2 \n\t"
"lhx %[temp0], %[v_reg](%[t]) \n\t"
"addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
"addu %[t], %[p_costs], %[ctx_reg] \n\t"
"addu %[cost], %[cost], %[temp0] \n\t"
"bne %[n], %[res_last], 1b \n\t"
" lw %[t], 0(%[t]) \n\t"
"2: \n\t"
".set pop \n\t"
: [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
[ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1)
: [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
[VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
[res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
: "memory"
);
// Last coefficient is always non-zero
{
const int v = abs(res->coeffs[n]);
assert(v != 0);
cost += VP8LevelCost(t, v);
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = (v == 1) ? 1 : 2;
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
VP8GetResidualCost = GetResidualCost_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View File

@ -0,0 +1,122 @@
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of cost functions
#include "dsp_dsp.h"
#if defined(WEBP_USE_NEON)
#include "dsp_neon.h"
#include "enc_cost_enc.h"
static const uint8_t position[16] = { 1, 2, 3, 4, 5, 6, 7, 8,
9, 10, 11, 12, 13, 14, 15, 16 };
static void SetResidualCoeffs_NEON(const int16_t* const coeffs,
VP8Residual* const res) {
const int16x8_t minus_one = vdupq_n_s16(-1);
const int16x8_t coeffs_0 = vld1q_s16(coeffs);
const int16x8_t coeffs_1 = vld1q_s16(coeffs + 8);
const uint16x8_t eob_0 = vtstq_s16(coeffs_0, minus_one);
const uint16x8_t eob_1 = vtstq_s16(coeffs_1, minus_one);
const uint8x16_t eob = vcombine_u8(vqmovn_u16(eob_0), vqmovn_u16(eob_1));
const uint8x16_t masked = vandq_u8(eob, vld1q_u8(position));
#ifdef __aarch64__
res->last = vmaxvq_u8(masked) - 1;
#else
const uint8x8_t eob_8x8 = vmax_u8(vget_low_u8(masked), vget_high_u8(masked));
const uint16x8_t eob_16x8 = vmovl_u8(eob_8x8);
const uint16x4_t eob_16x4 =
vmax_u16(vget_low_u16(eob_16x8), vget_high_u16(eob_16x8));
const uint32x4_t eob_32x4 = vmovl_u16(eob_16x4);
uint32x2_t eob_32x2 =
vmax_u32(vget_low_u32(eob_32x4), vget_high_u32(eob_32x4));
eob_32x2 = vpmax_u32(eob_32x2, eob_32x2);
vst1_lane_s32(&res->last, vreinterpret_s32_u32(eob_32x2), 0);
--res->last;
#endif // __aarch64__
res->coeffs = coeffs;
}
static int GetResidualCost_NEON(int ctx0, const VP8Residual* const res) {
uint8_t levels[16], ctxs[16];
uint16_t abs_levels[16];
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
if (res->last < 0) {
return VP8BitCost(0, p0);
}
{ // precompute clamped levels and contexts, packed to 8b.
const uint8x16_t kCst2 = vdupq_n_u8(2);
const uint8x16_t kCst67 = vdupq_n_u8(MAX_VARIABLE_LEVEL);
const int16x8_t c0 = vld1q_s16(res->coeffs);
const int16x8_t c1 = vld1q_s16(res->coeffs + 8);
const uint16x8_t E0 = vreinterpretq_u16_s16(vabsq_s16(c0));
const uint16x8_t E1 = vreinterpretq_u16_s16(vabsq_s16(c1));
const uint8x16_t F = vcombine_u8(vqmovn_u16(E0), vqmovn_u16(E1));
const uint8x16_t G = vminq_u8(F, kCst2); // context = 0,1,2
const uint8x16_t H = vminq_u8(F, kCst67); // clamp_level in [0..67]
vst1q_u8(ctxs, G);
vst1q_u8(levels, H);
vst1q_u16(abs_levels, E0);
vst1q_u16(abs_levels + 8, E1);
}
for (; n < res->last; ++n) {
const int ctx = ctxs[n];
const int level = levels[n];
const int flevel = abs_levels[n]; // full level
cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
t = costs[n + 1][ctx];
}
// Last coefficient is always non-zero
{
const int level = levels[n];
const int flevel = abs_levels[n];
assert(flevel != 0);
cost += VP8LevelFixedCosts[flevel] + t[level];
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = ctxs[n];
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitNEON(void) {
VP8SetResidualCoeffs = SetResidualCoeffs_NEON;
VP8GetResidualCost = GetResidualCost_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8EncDspCostInitNEON)
#endif // WEBP_USE_NEON

View File

@ -0,0 +1,119 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of cost functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
#include "enc_cost_enc.h"
#include "enc_vp8i_enc.h"
#include "utils_utils.h"
//------------------------------------------------------------------------------
static void SetResidualCoeffs_SSE2(const int16_t* const coeffs,
VP8Residual* const res) {
const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
// Use SSE2 to compare 16 values with a single instruction.
const __m128i zero = _mm_setzero_si128();
const __m128i m0 = _mm_packs_epi16(c0, c1);
const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
// Get the comparison results as a bitmask into 16bits. Negate the mask to get
// the position of entries that are not equal to zero. We don't need to mask
// out least significant bits according to res->first, since coeffs[0] is 0
// if res->first > 0.
const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
// The position of the most significant non-zero bit indicates the position of
// the last non-zero value.
assert(res->first == 0 || coeffs[0] == 0);
res->last = mask ? BitsLog2Floor(mask) : -1;
res->coeffs = coeffs;
}
static int GetResidualCost_SSE2(int ctx0, const VP8Residual* const res) {
uint8_t levels[16], ctxs[16];
uint16_t abs_levels[16];
int n = res->first;
// should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
const int p0 = res->prob[n][ctx0][0];
CostArrayPtr const costs = res->costs;
const uint16_t* t = costs[n][ctx0];
// bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
// (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
// be missing during the loop.
int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
if (res->last < 0) {
return VP8BitCost(0, p0);
}
{ // precompute clamped levels and contexts, packed to 8b.
const __m128i zero = _mm_setzero_si128();
const __m128i kCst2 = _mm_set1_epi8(2);
const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
const __m128i D0 = _mm_sub_epi16(zero, c0);
const __m128i D1 = _mm_sub_epi16(zero, c1);
const __m128i E0 = _mm_max_epi16(c0, D0); // abs(v), 16b
const __m128i E1 = _mm_max_epi16(c1, D1);
const __m128i F = _mm_packs_epi16(E0, E1);
const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2
const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67]
_mm_storeu_si128((__m128i*)&ctxs[0], G);
_mm_storeu_si128((__m128i*)&levels[0], H);
_mm_storeu_si128((__m128i*)&abs_levels[0], E0);
_mm_storeu_si128((__m128i*)&abs_levels[8], E1);
}
for (; n < res->last; ++n) {
const int ctx = ctxs[n];
const int level = levels[n];
const int flevel = abs_levels[n]; // full level
cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
t = costs[n + 1][ctx];
}
// Last coefficient is always non-zero
{
const int level = levels[n];
const int flevel = abs_levels[n];
assert(flevel != 0);
cost += VP8LevelFixedCosts[flevel] + t[level];
if (n < 15) {
const int b = VP8EncBands[n + 1];
const int ctx = ctxs[n];
const int last_p0 = res->prob[b][ctx][0];
cost += VP8BitCost(0, last_p0);
}
}
return cost;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspCostInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
VP8SetResidualCoeffs = SetResidualCoeffs_SSE2;
VP8GetResidualCost = GetResidualCost_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
#endif // WEBP_USE_SSE2

252
vendor/github.com/sizeofint/webpanimation/dsp_cpu.c generated vendored Normal file
View File

@ -0,0 +1,252 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// CPU detection
//
// Author: Christian Duvivier (cduvivier@google.com)
#include "dsp_dsp.h"
#if defined(WEBP_HAVE_NEON_RTCD)
#include <stdio.h>
#include <string.h>
#endif
#if defined(WEBP_ANDROID_NEON)
#include <cpu-features.h>
#endif
//------------------------------------------------------------------------------
// SSE2 detection.
//
// apple/darwin gcc-4.0.1 defines __PIC__, but not __pic__ with -fPIC.
#if (defined(__pic__) || defined(__PIC__)) && defined(__i386__)
static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"mov %%ebx, %%edi\n"
"cpuid\n"
"xchg %%edi, %%ebx\n"
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type), "c"(0));
}
#elif defined(__x86_64__) && \
(defined(__code_model_medium__) || defined(__code_model_large__)) && \
defined(__PIC__)
static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"xchg{q}\t{%%rbx}, %q1\n"
"cpuid\n"
"xchg{q}\t{%%rbx}, %q1\n"
: "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
"=d"(cpu_info[3])
: "a"(info_type), "c"(0));
}
#elif defined(__i386__) || defined(__x86_64__)
static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
"cpuid\n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type), "c"(0));
}
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
#if defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 150030729 // >= VS2008 SP1
#include <intrin.h>
#define GetCPUInfo(info, type) __cpuidex(info, type, 0) // set ecx=0
#define WEBP_HAVE_MSC_CPUID
#elif _MSC_VER > 1310
#include <intrin.h>
#define GetCPUInfo __cpuid
#define WEBP_HAVE_MSC_CPUID
#endif
#endif
// NaCl has no support for xgetbv or the raw opcode.
#if !defined(__native_client__) && (defined(__i386__) || defined(__x86_64__))
static WEBP_INLINE uint64_t xgetbv(void) {
const uint32_t ecx = 0;
uint32_t eax, edx;
// Use the raw opcode for xgetbv for compatibility with older toolchains.
__asm__ volatile (
".byte 0x0f, 0x01, 0xd0\n"
: "=a"(eax), "=d"(edx) : "c" (ecx));
return ((uint64_t)edx << 32) | eax;
}
#elif (defined(_M_X64) || defined(_M_IX86)) && \
defined(_MSC_FULL_VER) && _MSC_FULL_VER >= 160040219 // >= VS2010 SP1
#include <immintrin.h>
#define xgetbv() _xgetbv(0)
#elif defined(_MSC_VER) && defined(_M_IX86)
static WEBP_INLINE uint64_t xgetbv(void) {
uint32_t eax_, edx_;
__asm {
xor ecx, ecx // ecx = 0
// Use the raw opcode for xgetbv for compatibility with older toolchains.
__asm _emit 0x0f __asm _emit 0x01 __asm _emit 0xd0
mov eax_, eax
mov edx_, edx
}
return ((uint64_t)edx_ << 32) | eax_;
}
#else
#define xgetbv() 0U // no AVX for older x64 or unrecognized toolchains.
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_HAVE_MSC_CPUID)
// helper function for run-time detection of slow SSSE3 platforms
static int CheckSlowModel(int info) {
// Table listing display models with longer latencies for the bsr instruction
// (ie 2 cycles vs 10/16 cycles) and some SSSE3 instructions like pshufb.
// Refer to Intel 64 and IA-32 Architectures Optimization Reference Manual.
static const uint8_t kSlowModels[] = {
0x37, 0x4a, 0x4d, // Silvermont Microarchitecture
0x1c, 0x26, 0x27 // Atom Microarchitecture
};
const uint32_t model = ((info & 0xf0000) >> 12) | ((info >> 4) & 0xf);
const uint32_t family = (info >> 8) & 0xf;
if (family == 0x06) {
size_t i;
for (i = 0; i < sizeof(kSlowModels) / sizeof(kSlowModels[0]); ++i) {
if (model == kSlowModels[i]) return 1;
}
}
return 0;
}
static int x86CPUInfo(CPUFeature feature) {
int max_cpuid_value;
int cpu_info[4];
int is_intel = 0;
// get the highest feature value cpuid supports
GetCPUInfo(cpu_info, 0);
max_cpuid_value = cpu_info[0];
if (max_cpuid_value < 1) {
return 0;
} else {
const int VENDOR_ID_INTEL_EBX = 0x756e6547; // uneG
const int VENDOR_ID_INTEL_EDX = 0x49656e69; // Ieni
const int VENDOR_ID_INTEL_ECX = 0x6c65746e; // letn
is_intel = (cpu_info[1] == VENDOR_ID_INTEL_EBX &&
cpu_info[2] == VENDOR_ID_INTEL_ECX &&
cpu_info[3] == VENDOR_ID_INTEL_EDX); // genuine Intel?
}
GetCPUInfo(cpu_info, 1);
if (feature == kSSE2) {
return !!(cpu_info[3] & (1 << 26));
}
if (feature == kSSE3) {
return !!(cpu_info[2] & (1 << 0));
}
if (feature == kSlowSSSE3) {
if (is_intel && (cpu_info[2] & (1 << 9))) { // SSSE3?
return CheckSlowModel(cpu_info[0]);
}
return 0;
}
if (feature == kSSE4_1) {
return !!(cpu_info[2] & (1 << 19));
}
if (feature == kAVX) {
// bits 27 (OSXSAVE) & 28 (256-bit AVX)
if ((cpu_info[2] & 0x18000000) == 0x18000000) {
// XMM state and YMM state enabled by the OS.
return (xgetbv() & 0x6) == 0x6;
}
}
if (feature == kAVX2) {
if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
GetCPUInfo(cpu_info, 7);
return !!(cpu_info[1] & (1 << 5));
}
}
return 0;
}
VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
#elif defined(WEBP_ANDROID_NEON) // NB: needs to be before generic NEON test.
static int AndroidCPUInfo(CPUFeature feature) {
const AndroidCpuFamily cpu_family = android_getCpuFamily();
const uint64_t cpu_features = android_getCpuFeatures();
if (feature == kNEON) {
return cpu_family == ANDROID_CPU_FAMILY_ARM &&
(cpu_features & ANDROID_CPU_ARM_FEATURE_NEON) != 0;
}
return 0;
}
VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
#elif defined(EMSCRIPTEN) // also needs to be before generic NEON test
// Use compile flags as an indicator of SIMD support instead of a runtime check.
static int wasmCPUInfo(CPUFeature feature) {
switch (feature) {
#ifdef WEBP_USE_SSE2
case kSSE2:
return 1;
#endif
#ifdef WEBP_USE_SSE41
case kSSE3:
case kSlowSSSE3:
case kSSE4_1:
return 1;
#endif
#ifdef WEBP_USE_NEON
case kNEON:
return 1;
#endif
default:
break;
}
return 0;
}
VP8CPUInfo VP8GetCPUInfo = wasmCPUInfo;
#elif defined(WEBP_USE_NEON)
// define a dummy function to enable turning off NEON at runtime by setting
// VP8DecGetCPUInfo = NULL
static int armCPUInfo(CPUFeature feature) {
if (feature != kNEON) return 0;
#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
{
int has_neon = 0;
char line[200];
FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
if (cpuinfo == NULL) return 0;
while (fgets(line, sizeof(line), cpuinfo)) {
if (!strncmp(line, "Features", 8)) {
if (strstr(line, " neon ") != NULL) {
has_neon = 1;
break;
}
}
}
fclose(cpuinfo);
return has_neon;
}
#else
return 1;
#endif
}
VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \
defined(WEBP_USE_MSA)
static int mipsCPUInfo(CPUFeature feature) {
if ((feature == kMIPS32) || (feature == kMIPSdspR2) || (feature == kMSA)) {
return 1;
} else {
return 0;
}
}
VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
#else
VP8CPUInfo VP8GetCPUInfo = NULL;
#endif

887
vendor/github.com/sizeofint/webpanimation/dsp_dec.c generated vendored Normal file
View File

@ -0,0 +1,887 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical decoding functions, default plain-C implementations.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "dsp_dsp.h"
#include "dec_vp8i_dec.h"
#include "utils_utils.h"
//------------------------------------------------------------------------------
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(dst[(x) + (y) * BPS] + ((v) >> 3))
#define STORE2(y, dc, d, c) do { \
const int DC = (dc); \
STORE(0, y, DC + (d)); \
STORE(1, y, DC + (c)); \
STORE(2, y, DC - (c)); \
STORE(3, y, DC - (d)); \
} while (0)
#define MUL1(a) ((((a) * 20091) >> 16) + (a))
#define MUL2(a) (((a) * 35468) >> 16)
#if !WEBP_NEON_OMIT_C_CODE
static void TransformOne_C(const int16_t* in, uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
for (i = 0; i < 4; ++i) { // vertical pass
const int a = in[0] + in[8]; // [-4096, 4094]
const int b = in[0] - in[8]; // [-4095, 4095]
const int c = MUL2(in[4]) - MUL1(in[12]); // [-3783, 3783]
const int d = MUL1(in[4]) + MUL2(in[12]); // [-3785, 3781]
tmp[0] = a + d; // [-7881, 7875]
tmp[1] = b + c; // [-7878, 7878]
tmp[2] = b - c; // [-7878, 7878]
tmp[3] = a - d; // [-7877, 7879]
tmp += 4;
in++;
}
// Each pass is expanding the dynamic range by ~3.85 (upper bound).
// The exact value is (2. + (20091 + 35468) / 65536).
// After the second pass, maximum interval is [-3794, 3794], assuming
// an input in [-2048, 2047] interval. We then need to add a dst value
// in the [0, 255] range.
// In the worst case scenario, the input to clip_8b() can be as large as
// [-60713, 60968].
tmp = C;
for (i = 0; i < 4; ++i) { // horizontal pass
const int dc = tmp[0] + 4;
const int a = dc + tmp[8];
const int b = dc - tmp[8];
const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
STORE(0, 0, a + d);
STORE(1, 0, b + c);
STORE(2, 0, b - c);
STORE(3, 0, a - d);
tmp++;
dst += BPS;
}
}
// Simplified transform when only in[0], in[1] and in[4] are non-zero
static void TransformAC3_C(const int16_t* in, uint8_t* dst) {
const int a = in[0] + 4;
const int c4 = MUL2(in[4]);
const int d4 = MUL1(in[4]);
const int c1 = MUL2(in[1]);
const int d1 = MUL1(in[1]);
STORE2(0, a + d4, d1, c1);
STORE2(1, a + c4, d1, c1);
STORE2(2, a - c4, d1, c1);
STORE2(3, a - d4, d1, c1);
}
#undef MUL1
#undef MUL2
#undef STORE2
static void TransformTwo_C(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne_C(in, dst);
if (do_two) {
TransformOne_C(in + 16, dst + 4);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformUV_C(const int16_t* in, uint8_t* dst) {
VP8Transform(in + 0 * 16, dst, 1);
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
#if !WEBP_NEON_OMIT_C_CODE
static void TransformDC_C(const int16_t* in, uint8_t* dst) {
const int DC = in[0] + 4;
int i, j;
for (j = 0; j < 4; ++j) {
for (i = 0; i < 4; ++i) {
STORE(i, j, DC);
}
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void TransformDCUV_C(const int16_t* in, uint8_t* dst) {
if (in[0 * 16]) VP8TransformDC(in + 0 * 16, dst);
if (in[1 * 16]) VP8TransformDC(in + 1 * 16, dst + 4);
if (in[2 * 16]) VP8TransformDC(in + 2 * 16, dst + 4 * BPS);
if (in[3 * 16]) VP8TransformDC(in + 3 * 16, dst + 4 * BPS + 4);
}
#undef STORE
//------------------------------------------------------------------------------
// Paragraph 14.3
#if !WEBP_NEON_OMIT_C_CODE
static void TransformWHT_C(const int16_t* in, int16_t* out) {
int tmp[16];
int i;
for (i = 0; i < 4; ++i) {
const int a0 = in[0 + i] + in[12 + i];
const int a1 = in[4 + i] + in[ 8 + i];
const int a2 = in[4 + i] - in[ 8 + i];
const int a3 = in[0 + i] - in[12 + i];
tmp[0 + i] = a0 + a1;
tmp[8 + i] = a0 - a1;
tmp[4 + i] = a3 + a2;
tmp[12 + i] = a3 - a2;
}
for (i = 0; i < 4; ++i) {
const int dc = tmp[0 + i * 4] + 3; // w/ rounder
const int a0 = dc + tmp[3 + i * 4];
const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4];
const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4];
const int a3 = dc - tmp[3 + i * 4];
out[ 0] = (a0 + a1) >> 3;
out[16] = (a3 + a2) >> 3;
out[32] = (a0 - a1) >> 3;
out[48] = (a3 - a2) >> 3;
out += 64;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
//------------------------------------------------------------------------------
// Intra predictions
#define DST(x, y) dst[(x) + (y) * BPS]
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
const uint8_t* const clip0 = VP8kclip1 - top[-1];
int y;
for (y = 0; y < size; ++y) {
const uint8_t* const clip = clip0 + dst[-1];
int x;
for (x = 0; x < size; ++x) {
dst[x] = clip[top[x]];
}
dst += BPS;
}
}
static void TM4_C(uint8_t* dst) { TrueMotion(dst, 4); }
static void TM8uv_C(uint8_t* dst) { TrueMotion(dst, 8); }
static void TM16_C(uint8_t* dst) { TrueMotion(dst, 16); }
//------------------------------------------------------------------------------
// 16x16
static void VE16_C(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 16; ++j) {
memcpy(dst + j * BPS, dst - BPS, 16);
}
}
static void HE16_C(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
memset(dst, dst[-1], 16);
dst += BPS;
}
}
static WEBP_INLINE void Put16(int v, uint8_t* dst) {
int j;
for (j = 0; j < 16; ++j) {
memset(dst + j * BPS, v, 16);
}
}
static void DC16_C(uint8_t* dst) { // DC
int DC = 16;
int j;
for (j = 0; j < 16; ++j) {
DC += dst[-1 + j * BPS] + dst[j - BPS];
}
Put16(DC >> 5, dst);
}
static void DC16NoTop_C(uint8_t* dst) { // DC with top samples not available
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
DC += dst[-1 + j * BPS];
}
Put16(DC >> 4, dst);
}
static void DC16NoLeft_C(uint8_t* dst) { // DC with left samples not available
int DC = 8;
int i;
for (i = 0; i < 16; ++i) {
DC += dst[i - BPS];
}
Put16(DC >> 4, dst);
}
static void DC16NoTopLeft_C(uint8_t* dst) { // DC with no top and left samples
Put16(0x80, dst);
}
#endif // !WEBP_NEON_OMIT_C_CODE
VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
//------------------------------------------------------------------------------
// 4x4
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
#if !WEBP_NEON_OMIT_C_CODE
static void VE4_C(uint8_t* dst) { // vertical
const uint8_t* top = dst - BPS;
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
AVG3(top[ 0], top[1], top[2]),
AVG3(top[ 1], top[2], top[3]),
AVG3(top[ 2], top[3], top[4])
};
int i;
for (i = 0; i < 4; ++i) {
memcpy(dst + i * BPS, vals, sizeof(vals));
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void HE4_C(uint8_t* dst) { // horizontal
const int A = dst[-1 - BPS];
const int B = dst[-1];
const int C = dst[-1 + BPS];
const int D = dst[-1 + 2 * BPS];
const int E = dst[-1 + 3 * BPS];
WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(A, B, C));
WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(B, C, D));
WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(C, D, E));
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
}
#if !WEBP_NEON_OMIT_C_CODE
static void DC4_C(uint8_t* dst) { // DC
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
dc >>= 3;
for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
}
static void RD4_C(uint8_t* dst) { // Down-right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
const int L = dst[-1 + 3 * BPS];
const int X = dst[-1 - BPS];
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
const int D = dst[3 - BPS];
DST(0, 3) = AVG3(J, K, L);
DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
DST(3, 0) = AVG3(D, C, B);
}
static void LD4_C(uint8_t* dst) { // Down-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
const int D = dst[3 - BPS];
const int E = dst[4 - BPS];
const int F = dst[5 - BPS];
const int G = dst[6 - BPS];
const int H = dst[7 - BPS];
DST(0, 0) = AVG3(A, B, C);
DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
DST(3, 3) = AVG3(G, H, H);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void VR4_C(uint8_t* dst) { // Vertical-Right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
const int X = dst[-1 - BPS];
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
const int D = dst[3 - BPS];
DST(0, 0) = DST(1, 2) = AVG2(X, A);
DST(1, 0) = DST(2, 2) = AVG2(A, B);
DST(2, 0) = DST(3, 2) = AVG2(B, C);
DST(3, 0) = AVG2(C, D);
DST(0, 3) = AVG3(K, J, I);
DST(0, 2) = AVG3(J, I, X);
DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
DST(3, 1) = AVG3(B, C, D);
}
static void VL4_C(uint8_t* dst) { // Vertical-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
const int D = dst[3 - BPS];
const int E = dst[4 - BPS];
const int F = dst[5 - BPS];
const int G = dst[6 - BPS];
const int H = dst[7 - BPS];
DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
DST(3, 2) = AVG3(E, F, G);
DST(3, 3) = AVG3(F, G, H);
}
static void HU4_C(uint8_t* dst) { // Horizontal-Up
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
const int L = dst[-1 + 3 * BPS];
DST(0, 0) = AVG2(I, J);
DST(2, 0) = DST(0, 1) = AVG2(J, K);
DST(2, 1) = DST(0, 2) = AVG2(K, L);
DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
DST(3, 2) = DST(2, 2) =
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static void HD4_C(uint8_t* dst) { // Horizontal-Down
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
const int L = dst[-1 + 3 * BPS];
const int X = dst[-1 - BPS];
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
DST(0, 0) = DST(2, 1) = AVG2(I, X);
DST(0, 1) = DST(2, 2) = AVG2(J, I);
DST(0, 2) = DST(2, 3) = AVG2(K, J);
DST(0, 3) = AVG2(L, K);
DST(3, 0) = AVG3(A, B, C);
DST(2, 0) = AVG3(X, A, B);
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
DST(1, 3) = AVG3(L, K, J);
}
#undef DST
#undef AVG3
#undef AVG2
VP8PredFunc VP8PredLuma4[NUM_BMODES];
//------------------------------------------------------------------------------
// Chroma
#if !WEBP_NEON_OMIT_C_CODE
static void VE8uv_C(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 8; ++j) {
memcpy(dst + j * BPS, dst - BPS, 8);
}
}
static void HE8uv_C(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
memset(dst, dst[-1], 8);
dst += BPS;
}
}
// helper for chroma-DC predictions
static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
int j;
for (j = 0; j < 8; ++j) {
memset(dst + j * BPS, value, 8);
}
}
static void DC8uv_C(uint8_t* dst) { // DC
int dc0 = 8;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[i - BPS] + dst[-1 + i * BPS];
}
Put8x8uv(dc0 >> 4, dst);
}
static void DC8uvNoLeft_C(uint8_t* dst) { // DC with no left samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[i - BPS];
}
Put8x8uv(dc0 >> 3, dst);
}
static void DC8uvNoTop_C(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
dc0 += dst[-1 + i * BPS];
}
Put8x8uv(dc0 >> 3, dst);
}
static void DC8uvNoTopLeft_C(uint8_t* dst) { // DC with nothing
Put8x8uv(0x80, dst);
}
#endif // !WEBP_NEON_OMIT_C_CODE
VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
//------------------------------------------------------------------------------
// Edge filtering functions
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
// 4 pixels in, 2 pixels out
static WEBP_INLINE void DoFilter2_C(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1]; // in [-893,892]
const int a1 = VP8ksclip2[(a + 4) >> 3]; // in [-16,15]
const int a2 = VP8ksclip2[(a + 3) >> 3];
p[-step] = VP8kclip1[p0 + a2];
p[ 0] = VP8kclip1[q0 - a1];
}
// 4 pixels in, 4 pixels out
static WEBP_INLINE void DoFilter4_C(uint8_t* p, int step) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0);
const int a1 = VP8ksclip2[(a + 4) >> 3];
const int a2 = VP8ksclip2[(a + 3) >> 3];
const int a3 = (a1 + 1) >> 1;
p[-2*step] = VP8kclip1[p1 + a3];
p[- step] = VP8kclip1[p0 + a2];
p[ 0] = VP8kclip1[q0 - a1];
p[ step] = VP8kclip1[q1 - a3];
}
// 6 pixels in, 6 pixels out
static WEBP_INLINE void DoFilter6_C(uint8_t* p, int step) {
const int p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2*step];
const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
// a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
p[-3*step] = VP8kclip1[p2 + a3];
p[-2*step] = VP8kclip1[p1 + a2];
p[- step] = VP8kclip1[p0 + a1];
p[ 0] = VP8kclip1[q0 - a1];
p[ step] = VP8kclip1[q1 - a2];
p[ 2*step] = VP8kclip1[q2 - a3];
}
static WEBP_INLINE int Hev(const uint8_t* p, int step, int thresh) {
const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
return (VP8kabs0[p1 - p0] > thresh) || (VP8kabs0[q1 - q0] > thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int NeedsFilter_C(const uint8_t* p, int step, int t) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
return ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) <= t);
}
#endif // !WEBP_NEON_OMIT_C_CODE
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static WEBP_INLINE int NeedsFilter2_C(const uint8_t* p,
int step, int t, int it) {
const int p3 = p[-4 * step], p2 = p[-3 * step], p1 = p[-2 * step];
const int p0 = p[-step], q0 = p[0];
const int q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
if ((4 * VP8kabs0[p0 - q0] + VP8kabs0[p1 - q1]) > t) return 0;
return VP8kabs0[p3 - p2] <= it && VP8kabs0[p2 - p1] <= it &&
VP8kabs0[p1 - p0] <= it && VP8kabs0[q3 - q2] <= it &&
VP8kabs0[q2 - q1] <= it && VP8kabs0[q1 - q0] <= it;
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
#if !WEBP_NEON_OMIT_C_CODE
static void SimpleVFilter16_C(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (NeedsFilter_C(p + i, stride, thresh2)) {
DoFilter2_C(p + i, stride);
}
}
}
static void SimpleHFilter16_C(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (NeedsFilter_C(p + i * stride, 1, thresh2)) {
DoFilter2_C(p + i * stride, 1);
}
}
}
static void SimpleVFilter16i_C(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16_C(p, stride, thresh);
}
}
static void SimpleHFilter16i_C(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16_C(p, stride, thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static WEBP_INLINE void FilterLoop26_C(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh,
int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
if (Hev(p, hstride, hev_thresh)) {
DoFilter2_C(p, hstride);
} else {
DoFilter6_C(p, hstride);
}
}
p += vstride;
}
}
static WEBP_INLINE void FilterLoop24_C(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh,
int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (NeedsFilter2_C(p, hstride, thresh2, ithresh)) {
if (Hev(p, hstride, hev_thresh)) {
DoFilter2_C(p, hstride);
} else {
DoFilter4_C(p, hstride);
}
}
p += vstride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
// on macroblock edges
static void VFilter16_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
static void HFilter16_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// on three inner edges
static void VFilter16i_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
FilterLoop24_C(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter16i_C(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
FilterLoop24_C(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
// 8-pixels wide variant, for chroma filtering
static void VFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26_C(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26_C(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
#if !WEBP_NEON_OMIT_C_CODE
static void VFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static void HFilter8i_C(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24_C(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24_C(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
static void DitherCombine8x8_C(const uint8_t* dither, uint8_t* dst,
int dst_stride) {
int i, j;
for (j = 0; j < 8; ++j) {
for (i = 0; i < 8; ++i) {
const int delta0 = dither[i] - VP8_DITHER_AMP_CENTER;
const int delta1 =
(delta0 + VP8_DITHER_DESCALE_ROUNDER) >> VP8_DITHER_DESCALE;
dst[i] = clip_8b((int)dst[i] + delta1);
}
dst += dst_stride;
dither += 8;
}
}
//------------------------------------------------------------------------------
VP8DecIdct2 VP8Transform;
VP8DecIdct VP8TransformAC3;
VP8DecIdct VP8TransformUV;
VP8DecIdct VP8TransformDC;
VP8DecIdct VP8TransformDCUV;
VP8LumaFilterFunc VP8VFilter16;
VP8LumaFilterFunc VP8HFilter16;
VP8ChromaFilterFunc VP8VFilter8;
VP8ChromaFilterFunc VP8HFilter8;
VP8LumaFilterFunc VP8VFilter16i;
VP8LumaFilterFunc VP8HFilter16i;
VP8ChromaFilterFunc VP8VFilter8i;
VP8ChromaFilterFunc VP8HFilter8i;
VP8SimpleFilterFunc VP8SimpleVFilter16;
VP8SimpleFilterFunc VP8SimpleHFilter16;
VP8SimpleFilterFunc VP8SimpleVFilter16i;
VP8SimpleFilterFunc VP8SimpleHFilter16i;
void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
int dst_stride);
extern void VP8DspInitSSE2(void);
extern void VP8DspInitSSE41(void);
extern void VP8DspInitNEON(void);
extern void VP8DspInitMIPS32(void);
extern void VP8DspInitMIPSdspR2(void);
extern void VP8DspInitMSA(void);
WEBP_DSP_INIT_FUNC(VP8DspInit) {
VP8InitClipTables();
#if !WEBP_NEON_OMIT_C_CODE
VP8TransformWHT = TransformWHT_C;
VP8Transform = TransformTwo_C;
VP8TransformDC = TransformDC_C;
VP8TransformAC3 = TransformAC3_C;
#endif
VP8TransformUV = TransformUV_C;
VP8TransformDCUV = TransformDCUV_C;
#if !WEBP_NEON_OMIT_C_CODE
VP8VFilter16 = VFilter16_C;
VP8VFilter16i = VFilter16i_C;
VP8HFilter16 = HFilter16_C;
VP8VFilter8 = VFilter8_C;
VP8VFilter8i = VFilter8i_C;
VP8SimpleVFilter16 = SimpleVFilter16_C;
VP8SimpleHFilter16 = SimpleHFilter16_C;
VP8SimpleVFilter16i = SimpleVFilter16i_C;
VP8SimpleHFilter16i = SimpleHFilter16i_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8HFilter16i = HFilter16i_C;
VP8HFilter8 = HFilter8_C;
VP8HFilter8i = HFilter8i_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE
VP8PredLuma4[0] = DC4_C;
VP8PredLuma4[1] = TM4_C;
VP8PredLuma4[2] = VE4_C;
VP8PredLuma4[4] = RD4_C;
VP8PredLuma4[6] = LD4_C;
#endif
VP8PredLuma4[3] = HE4_C;
VP8PredLuma4[5] = VR4_C;
VP8PredLuma4[7] = VL4_C;
VP8PredLuma4[8] = HD4_C;
VP8PredLuma4[9] = HU4_C;
#if !WEBP_NEON_OMIT_C_CODE
VP8PredLuma16[0] = DC16_C;
VP8PredLuma16[1] = TM16_C;
VP8PredLuma16[2] = VE16_C;
VP8PredLuma16[3] = HE16_C;
VP8PredLuma16[4] = DC16NoTop_C;
VP8PredLuma16[5] = DC16NoLeft_C;
VP8PredLuma16[6] = DC16NoTopLeft_C;
VP8PredChroma8[0] = DC8uv_C;
VP8PredChroma8[1] = TM8uv_C;
VP8PredChroma8[2] = VE8uv_C;
VP8PredChroma8[3] = HE8uv_C;
VP8PredChroma8[4] = DC8uvNoTop_C;
VP8PredChroma8[5] = DC8uvNoLeft_C;
VP8PredChroma8[6] = DC8uvNoTopLeft_C;
#endif
VP8DitherCombine8x8 = DitherCombine8x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8DspInitSSE2();
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8DspInitSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8DspInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8DspInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
VP8DspInitMSA();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8DspInitNEON();
}
#endif
assert(VP8TransformWHT != NULL);
assert(VP8Transform != NULL);
assert(VP8TransformDC != NULL);
assert(VP8TransformAC3 != NULL);
assert(VP8TransformUV != NULL);
assert(VP8TransformDCUV != NULL);
assert(VP8VFilter16 != NULL);
assert(VP8HFilter16 != NULL);
assert(VP8VFilter8 != NULL);
assert(VP8HFilter8 != NULL);
assert(VP8VFilter16i != NULL);
assert(VP8HFilter16i != NULL);
assert(VP8VFilter8i != NULL);
assert(VP8HFilter8i != NULL);
assert(VP8SimpleVFilter16 != NULL);
assert(VP8SimpleHFilter16 != NULL);
assert(VP8SimpleVFilter16i != NULL);
assert(VP8SimpleHFilter16i != NULL);
assert(VP8PredLuma4[0] != NULL);
assert(VP8PredLuma4[1] != NULL);
assert(VP8PredLuma4[2] != NULL);
assert(VP8PredLuma4[3] != NULL);
assert(VP8PredLuma4[4] != NULL);
assert(VP8PredLuma4[5] != NULL);
assert(VP8PredLuma4[6] != NULL);
assert(VP8PredLuma4[7] != NULL);
assert(VP8PredLuma4[8] != NULL);
assert(VP8PredLuma4[9] != NULL);
assert(VP8PredLuma16[0] != NULL);
assert(VP8PredLuma16[1] != NULL);
assert(VP8PredLuma16[2] != NULL);
assert(VP8PredLuma16[3] != NULL);
assert(VP8PredLuma16[4] != NULL);
assert(VP8PredLuma16[5] != NULL);
assert(VP8PredLuma16[6] != NULL);
assert(VP8PredChroma8[0] != NULL);
assert(VP8PredChroma8[1] != NULL);
assert(VP8PredChroma8[2] != NULL);
assert(VP8PredChroma8[3] != NULL);
assert(VP8PredChroma8[4] != NULL);
assert(VP8PredChroma8[5] != NULL);
assert(VP8PredChroma8[6] != NULL);
assert(VP8DitherCombine8x8 != NULL);
}

View File

@ -0,0 +1,369 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Clipping tables for filtering
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
// define to 0 to have run-time table initialization
#if !defined(USE_STATIC_TABLES)
#define USE_STATIC_TABLES 1 // ALTERNATE_CODE
#endif
#if (USE_STATIC_TABLES == 1)
static const uint8_t abs0[255 + 255 + 1] = {
0xff, 0xfe, 0xfd, 0xfc, 0xfb, 0xfa, 0xf9, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4,
0xf3, 0xf2, 0xf1, 0xf0, 0xef, 0xee, 0xed, 0xec, 0xeb, 0xea, 0xe9, 0xe8,
0xe7, 0xe6, 0xe5, 0xe4, 0xe3, 0xe2, 0xe1, 0xe0, 0xdf, 0xde, 0xdd, 0xdc,
0xdb, 0xda, 0xd9, 0xd8, 0xd7, 0xd6, 0xd5, 0xd4, 0xd3, 0xd2, 0xd1, 0xd0,
0xcf, 0xce, 0xcd, 0xcc, 0xcb, 0xca, 0xc9, 0xc8, 0xc7, 0xc6, 0xc5, 0xc4,
0xc3, 0xc2, 0xc1, 0xc0, 0xbf, 0xbe, 0xbd, 0xbc, 0xbb, 0xba, 0xb9, 0xb8,
0xb7, 0xb6, 0xb5, 0xb4, 0xb3, 0xb2, 0xb1, 0xb0, 0xaf, 0xae, 0xad, 0xac,
0xab, 0xaa, 0xa9, 0xa8, 0xa7, 0xa6, 0xa5, 0xa4, 0xa3, 0xa2, 0xa1, 0xa0,
0x9f, 0x9e, 0x9d, 0x9c, 0x9b, 0x9a, 0x99, 0x98, 0x97, 0x96, 0x95, 0x94,
0x93, 0x92, 0x91, 0x90, 0x8f, 0x8e, 0x8d, 0x8c, 0x8b, 0x8a, 0x89, 0x88,
0x87, 0x86, 0x85, 0x84, 0x83, 0x82, 0x81, 0x80, 0x7f, 0x7e, 0x7d, 0x7c,
0x7b, 0x7a, 0x79, 0x78, 0x77, 0x76, 0x75, 0x74, 0x73, 0x72, 0x71, 0x70,
0x6f, 0x6e, 0x6d, 0x6c, 0x6b, 0x6a, 0x69, 0x68, 0x67, 0x66, 0x65, 0x64,
0x63, 0x62, 0x61, 0x60, 0x5f, 0x5e, 0x5d, 0x5c, 0x5b, 0x5a, 0x59, 0x58,
0x57, 0x56, 0x55, 0x54, 0x53, 0x52, 0x51, 0x50, 0x4f, 0x4e, 0x4d, 0x4c,
0x4b, 0x4a, 0x49, 0x48, 0x47, 0x46, 0x45, 0x44, 0x43, 0x42, 0x41, 0x40,
0x3f, 0x3e, 0x3d, 0x3c, 0x3b, 0x3a, 0x39, 0x38, 0x37, 0x36, 0x35, 0x34,
0x33, 0x32, 0x31, 0x30, 0x2f, 0x2e, 0x2d, 0x2c, 0x2b, 0x2a, 0x29, 0x28,
0x27, 0x26, 0x25, 0x24, 0x23, 0x22, 0x21, 0x20, 0x1f, 0x1e, 0x1d, 0x1c,
0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04,
0x03, 0x02, 0x01, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
static const uint8_t sclip1[1020 + 1020 + 1] = {
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93,
0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab,
0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3,
0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3,
0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23,
0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f,
0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b,
0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,
0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53,
0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f,
0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b,
0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
};
static const uint8_t sclip2[112 + 112 + 1] = {
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb,
0xfc, 0xfd, 0xfe, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f,
0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f, 0x0f
};
static const uint8_t clip1[255 + 511 + 1] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x00, 0x00, 0x00, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14,
0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c,
0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0,
0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc,
0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8,
0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4,
0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0,
0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec,
0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
};
#else
// uninitialized tables
static uint8_t abs0[255 + 255 + 1];
static int8_t sclip1[1020 + 1020 + 1];
static int8_t sclip2[112 + 112 + 1];
static uint8_t clip1[255 + 511 + 1];
// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
#endif // USE_STATIC_TABLES
const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
const uint8_t* const VP8kclip1 = &clip1[255];
const uint8_t* const VP8kabs0 = &abs0[255];
WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
#if (USE_STATIC_TABLES == 0)
int i;
if (!tables_ok) {
for (i = -255; i <= 255; ++i) {
abs0[255 + i] = (i < 0) ? -i : i;
}
for (i = -1020; i <= 1020; ++i) {
sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
}
for (i = -112; i <= 112; ++i) {
sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
}
for (i = -255; i <= 255 + 255; ++i) {
clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
}
tables_ok = 1;
}
#endif // USE_STATIC_TABLES
}

View File

@ -0,0 +1,587 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of dsp functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "dsp_mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
static WEBP_INLINE int abs_mips32(int x) {
const int sign = x >> 31;
return (x ^ sign) - sign;
}
// 4 pixels in, 2 pixels out
static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0) + VP8ksclip1[p1 - q1];
const int a1 = VP8ksclip2[(a + 4) >> 3];
const int a2 = VP8ksclip2[(a + 3) >> 3];
p[-step] = VP8kclip1[p0 + a2];
p[ 0] = VP8kclip1[q0 - a1];
}
// 4 pixels in, 4 pixels out
static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
const int a = 3 * (q0 - p0);
const int a1 = VP8ksclip2[(a + 4) >> 3];
const int a2 = VP8ksclip2[(a + 3) >> 3];
const int a3 = (a1 + 1) >> 1;
p[-2 * step] = VP8kclip1[p1 + a3];
p[- step] = VP8kclip1[p0 + a2];
p[ 0] = VP8kclip1[q0 - a1];
p[ step] = VP8kclip1[q1 - a3];
}
// 6 pixels in, 6 pixels out
static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
// a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
p[-3 * step] = VP8kclip1[p2 + a3];
p[-2 * step] = VP8kclip1[p1 + a2];
p[- step] = VP8kclip1[p0 + a1];
p[ 0] = VP8kclip1[q0 - a1];
p[ step] = VP8kclip1[q1 - a2];
p[ 2 * step] = VP8kclip1[q2 - a3];
}
static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
}
static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
}
static WEBP_INLINE int needs_filter2(const uint8_t* p,
int step, int t, int it) {
const int p3 = p[-4 * step], p2 = p[-3 * step];
const int p1 = p[-2 * step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
return 0;
}
return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
abs_mips32(p1 - p0) <= it && abs_mips32(q3 - q2) <= it &&
abs_mips32(q2 - q1) <= it && abs_mips32(q1 - q0) <= it;
}
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
do_filter6(p, hstride);
}
}
p += vstride;
}
}
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
do_filter4(p, hstride);
}
}
p += vstride;
}
}
// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
static void HFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
}
static void HFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
}
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (needs_filter(p + i, stride, thresh2)) {
do_filter2(p + i, stride);
}
}
}
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
if (needs_filter(p + i * stride, 1, thresh2)) {
do_filter2(p + i * stride, 1);
}
}
}
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16(p, stride, thresh);
}
}
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16(p, stride, thresh);
}
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14;
int temp15, temp16, temp17, temp18;
int16_t* p_in = (int16_t*)in;
// loops unrolled and merged to avoid usage of tmp buffer
// and to reduce number of stalls. MUL macro is written
// in assembler and inlined
__asm__ volatile(
"lh %[temp0], 0(%[in]) \n\t"
"lh %[temp8], 16(%[in]) \n\t"
"lh %[temp4], 8(%[in]) \n\t"
"lh %[temp12], 24(%[in]) \n\t"
"addu %[temp16], %[temp0], %[temp8] \n\t"
"subu %[temp0], %[temp0], %[temp8] \n\t"
"mul %[temp8], %[temp4], %[kC2] \n\t"
"mul %[temp17], %[temp12], %[kC1] \n\t"
"mul %[temp4], %[temp4], %[kC1] \n\t"
"mul %[temp12], %[temp12], %[kC2] \n\t"
"lh %[temp1], 2(%[in]) \n\t"
"lh %[temp5], 10(%[in]) \n\t"
"lh %[temp9], 18(%[in]) \n\t"
"lh %[temp13], 26(%[in]) \n\t"
"sra %[temp8], %[temp8], 16 \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"sra %[temp4], %[temp4], 16 \n\t"
"sra %[temp12], %[temp12], 16 \n\t"
"lh %[temp2], 4(%[in]) \n\t"
"lh %[temp6], 12(%[in]) \n\t"
"lh %[temp10], 20(%[in]) \n\t"
"lh %[temp14], 28(%[in]) \n\t"
"subu %[temp17], %[temp8], %[temp17] \n\t"
"addu %[temp4], %[temp4], %[temp12] \n\t"
"addu %[temp8], %[temp16], %[temp4] \n\t"
"subu %[temp4], %[temp16], %[temp4] \n\t"
"addu %[temp16], %[temp1], %[temp9] \n\t"
"subu %[temp1], %[temp1], %[temp9] \n\t"
"lh %[temp3], 6(%[in]) \n\t"
"lh %[temp7], 14(%[in]) \n\t"
"lh %[temp11], 22(%[in]) \n\t"
"lh %[temp15], 30(%[in]) \n\t"
"addu %[temp12], %[temp0], %[temp17] \n\t"
"subu %[temp0], %[temp0], %[temp17] \n\t"
"mul %[temp9], %[temp5], %[kC2] \n\t"
"mul %[temp17], %[temp13], %[kC1] \n\t"
"mul %[temp5], %[temp5], %[kC1] \n\t"
"mul %[temp13], %[temp13], %[kC2] \n\t"
"sra %[temp9], %[temp9], 16 \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"subu %[temp17], %[temp9], %[temp17] \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"sra %[temp13], %[temp13], 16 \n\t"
"addu %[temp5], %[temp5], %[temp13] \n\t"
"addu %[temp13], %[temp1], %[temp17] \n\t"
"subu %[temp1], %[temp1], %[temp17] \n\t"
"mul %[temp17], %[temp14], %[kC1] \n\t"
"mul %[temp14], %[temp14], %[kC2] \n\t"
"addu %[temp9], %[temp16], %[temp5] \n\t"
"subu %[temp5], %[temp16], %[temp5] \n\t"
"addu %[temp16], %[temp2], %[temp10] \n\t"
"subu %[temp2], %[temp2], %[temp10] \n\t"
"mul %[temp10], %[temp6], %[kC2] \n\t"
"mul %[temp6], %[temp6], %[kC1] \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"sra %[temp14], %[temp14], 16 \n\t"
"sra %[temp10], %[temp10], 16 \n\t"
"sra %[temp6], %[temp6], 16 \n\t"
"subu %[temp17], %[temp10], %[temp17] \n\t"
"addu %[temp6], %[temp6], %[temp14] \n\t"
"addu %[temp10], %[temp16], %[temp6] \n\t"
"subu %[temp6], %[temp16], %[temp6] \n\t"
"addu %[temp14], %[temp2], %[temp17] \n\t"
"subu %[temp2], %[temp2], %[temp17] \n\t"
"mul %[temp17], %[temp15], %[kC1] \n\t"
"mul %[temp15], %[temp15], %[kC2] \n\t"
"addu %[temp16], %[temp3], %[temp11] \n\t"
"subu %[temp3], %[temp3], %[temp11] \n\t"
"mul %[temp11], %[temp7], %[kC2] \n\t"
"mul %[temp7], %[temp7], %[kC1] \n\t"
"addiu %[temp8], %[temp8], 4 \n\t"
"addiu %[temp12], %[temp12], 4 \n\t"
"addiu %[temp0], %[temp0], 4 \n\t"
"addiu %[temp4], %[temp4], 4 \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"sra %[temp15], %[temp15], 16 \n\t"
"sra %[temp11], %[temp11], 16 \n\t"
"sra %[temp7], %[temp7], 16 \n\t"
"subu %[temp17], %[temp11], %[temp17] \n\t"
"addu %[temp7], %[temp7], %[temp15] \n\t"
"addu %[temp15], %[temp3], %[temp17] \n\t"
"subu %[temp3], %[temp3], %[temp17] \n\t"
"addu %[temp11], %[temp16], %[temp7] \n\t"
"subu %[temp7], %[temp16], %[temp7] \n\t"
"addu %[temp16], %[temp8], %[temp10] \n\t"
"subu %[temp8], %[temp8], %[temp10] \n\t"
"mul %[temp10], %[temp9], %[kC2] \n\t"
"mul %[temp17], %[temp11], %[kC1] \n\t"
"mul %[temp9], %[temp9], %[kC1] \n\t"
"mul %[temp11], %[temp11], %[kC2] \n\t"
"sra %[temp10], %[temp10], 16 \n\t"
"sra %[temp17], %[temp17], 16 \n\t"
"sra %[temp9], %[temp9], 16 \n\t"
"sra %[temp11], %[temp11], 16 \n\t"
"subu %[temp17], %[temp10], %[temp17] \n\t"
"addu %[temp11], %[temp9], %[temp11] \n\t"
"addu %[temp10], %[temp12], %[temp14] \n\t"
"subu %[temp12], %[temp12], %[temp14] \n\t"
"mul %[temp14], %[temp13], %[kC2] \n\t"
"mul %[temp9], %[temp15], %[kC1] \n\t"
"mul %[temp13], %[temp13], %[kC1] \n\t"
"mul %[temp15], %[temp15], %[kC2] \n\t"
"sra %[temp14], %[temp14], 16 \n\t"
"sra %[temp9], %[temp9], 16 \n\t"
"sra %[temp13], %[temp13], 16 \n\t"
"sra %[temp15], %[temp15], 16 \n\t"
"subu %[temp9], %[temp14], %[temp9] \n\t"
"addu %[temp15], %[temp13], %[temp15] \n\t"
"addu %[temp14], %[temp0], %[temp2] \n\t"
"subu %[temp0], %[temp0], %[temp2] \n\t"
"mul %[temp2], %[temp1], %[kC2] \n\t"
"mul %[temp13], %[temp3], %[kC1] \n\t"
"mul %[temp1], %[temp1], %[kC1] \n\t"
"mul %[temp3], %[temp3], %[kC2] \n\t"
"sra %[temp2], %[temp2], 16 \n\t"
"sra %[temp13], %[temp13], 16 \n\t"
"sra %[temp1], %[temp1], 16 \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
"subu %[temp13], %[temp2], %[temp13] \n\t"
"addu %[temp3], %[temp1], %[temp3] \n\t"
"addu %[temp2], %[temp4], %[temp6] \n\t"
"subu %[temp4], %[temp4], %[temp6] \n\t"
"mul %[temp6], %[temp5], %[kC2] \n\t"
"mul %[temp1], %[temp7], %[kC1] \n\t"
"mul %[temp5], %[temp5], %[kC1] \n\t"
"mul %[temp7], %[temp7], %[kC2] \n\t"
"sra %[temp6], %[temp6], 16 \n\t"
"sra %[temp1], %[temp1], 16 \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"sra %[temp7], %[temp7], 16 \n\t"
"subu %[temp1], %[temp6], %[temp1] \n\t"
"addu %[temp7], %[temp5], %[temp7] \n\t"
"addu %[temp5], %[temp16], %[temp11] \n\t"
"subu %[temp16], %[temp16], %[temp11] \n\t"
"addu %[temp11], %[temp8], %[temp17] \n\t"
"subu %[temp8], %[temp8], %[temp17] \n\t"
"sra %[temp5], %[temp5], 3 \n\t"
"sra %[temp16], %[temp16], 3 \n\t"
"sra %[temp11], %[temp11], 3 \n\t"
"sra %[temp8], %[temp8], 3 \n\t"
"addu %[temp17], %[temp10], %[temp15] \n\t"
"subu %[temp10], %[temp10], %[temp15] \n\t"
"addu %[temp15], %[temp12], %[temp9] \n\t"
"subu %[temp12], %[temp12], %[temp9] \n\t"
"sra %[temp17], %[temp17], 3 \n\t"
"sra %[temp10], %[temp10], 3 \n\t"
"sra %[temp15], %[temp15], 3 \n\t"
"sra %[temp12], %[temp12], 3 \n\t"
"addu %[temp9], %[temp14], %[temp3] \n\t"
"subu %[temp14], %[temp14], %[temp3] \n\t"
"addu %[temp3], %[temp0], %[temp13] \n\t"
"subu %[temp0], %[temp0], %[temp13] \n\t"
"sra %[temp9], %[temp9], 3 \n\t"
"sra %[temp14], %[temp14], 3 \n\t"
"sra %[temp3], %[temp3], 3 \n\t"
"sra %[temp0], %[temp0], 3 \n\t"
"addu %[temp13], %[temp2], %[temp7] \n\t"
"subu %[temp2], %[temp2], %[temp7] \n\t"
"addu %[temp7], %[temp4], %[temp1] \n\t"
"subu %[temp4], %[temp4], %[temp1] \n\t"
"sra %[temp13], %[temp13], 3 \n\t"
"sra %[temp2], %[temp2], 3 \n\t"
"sra %[temp7], %[temp7], 3 \n\t"
"sra %[temp4], %[temp4], 3 \n\t"
"addiu %[temp6], $zero, 255 \n\t"
"lbu %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp1], %[temp1], %[temp5] \n\t"
"sra %[temp5], %[temp1], 8 \n\t"
"sra %[temp18], %[temp1], 31 \n\t"
"beqz %[temp5], 1f \n\t"
"xor %[temp1], %[temp1], %[temp1] \n\t"
"movz %[temp1], %[temp6], %[temp18] \n\t"
"1: \n\t"
"lbu %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp18], %[temp18], %[temp11] \n\t"
"sra %[temp11], %[temp18], 8 \n\t"
"sra %[temp1], %[temp18], 31 \n\t"
"beqz %[temp11], 2f \n\t"
"xor %[temp18], %[temp18], %[temp18] \n\t"
"movz %[temp18], %[temp6], %[temp1] \n\t"
"2: \n\t"
"lbu %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp1], %[temp1], %[temp8] \n\t"
"sra %[temp8], %[temp1], 8 \n\t"
"sra %[temp18], %[temp1], 31 \n\t"
"beqz %[temp8], 3f \n\t"
"xor %[temp1], %[temp1], %[temp1] \n\t"
"movz %[temp1], %[temp6], %[temp18] \n\t"
"3: \n\t"
"lbu %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp18], %[temp18], %[temp16] \n\t"
"sra %[temp16], %[temp18], 8 \n\t"
"sra %[temp1], %[temp18], 31 \n\t"
"beqz %[temp16], 4f \n\t"
"xor %[temp18], %[temp18], %[temp18] \n\t"
"movz %[temp18], %[temp6], %[temp1] \n\t"
"4: \n\t"
"sb %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp17] \n\t"
"addu %[temp8], %[temp8], %[temp15] \n\t"
"addu %[temp11], %[temp11], %[temp12] \n\t"
"addu %[temp16], %[temp16], %[temp10] \n\t"
"sra %[temp18], %[temp5], 8 \n\t"
"sra %[temp1], %[temp5], 31 \n\t"
"beqz %[temp18], 5f \n\t"
"xor %[temp5], %[temp5], %[temp5] \n\t"
"movz %[temp5], %[temp6], %[temp1] \n\t"
"5: \n\t"
"sra %[temp18], %[temp8], 8 \n\t"
"sra %[temp1], %[temp8], 31 \n\t"
"beqz %[temp18], 6f \n\t"
"xor %[temp8], %[temp8], %[temp8] \n\t"
"movz %[temp8], %[temp6], %[temp1] \n\t"
"6: \n\t"
"sra %[temp18], %[temp11], 8 \n\t"
"sra %[temp1], %[temp11], 31 \n\t"
"sra %[temp17], %[temp16], 8 \n\t"
"sra %[temp15], %[temp16], 31 \n\t"
"beqz %[temp18], 7f \n\t"
"xor %[temp11], %[temp11], %[temp11] \n\t"
"movz %[temp11], %[temp6], %[temp1] \n\t"
"7: \n\t"
"beqz %[temp17], 8f \n\t"
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp15] \n\t"
"8: \n\t"
"sb %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp9] \n\t"
"addu %[temp8], %[temp8], %[temp3] \n\t"
"addu %[temp11], %[temp11], %[temp0] \n\t"
"addu %[temp16], %[temp16], %[temp14] \n\t"
"sra %[temp18], %[temp5], 8 \n\t"
"sra %[temp1], %[temp5], 31 \n\t"
"sra %[temp17], %[temp8], 8 \n\t"
"sra %[temp15], %[temp8], 31 \n\t"
"sra %[temp12], %[temp11], 8 \n\t"
"sra %[temp10], %[temp11], 31 \n\t"
"sra %[temp9], %[temp16], 8 \n\t"
"sra %[temp3], %[temp16], 31 \n\t"
"beqz %[temp18], 9f \n\t"
"xor %[temp5], %[temp5], %[temp5] \n\t"
"movz %[temp5], %[temp6], %[temp1] \n\t"
"9: \n\t"
"beqz %[temp17], 10f \n\t"
"xor %[temp8], %[temp8], %[temp8] \n\t"
"movz %[temp8], %[temp6], %[temp15] \n\t"
"10: \n\t"
"beqz %[temp12], 11f \n\t"
"xor %[temp11], %[temp11], %[temp11] \n\t"
"movz %[temp11], %[temp6], %[temp10] \n\t"
"11: \n\t"
"beqz %[temp9], 12f \n\t"
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp3] \n\t"
"12: \n\t"
"sb %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
"lbu %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp13] \n\t"
"addu %[temp8], %[temp8], %[temp7] \n\t"
"addu %[temp11], %[temp11], %[temp4] \n\t"
"addu %[temp16], %[temp16], %[temp2] \n\t"
"sra %[temp18], %[temp5], 8 \n\t"
"sra %[temp1], %[temp5], 31 \n\t"
"sra %[temp17], %[temp8], 8 \n\t"
"sra %[temp15], %[temp8], 31 \n\t"
"sra %[temp12], %[temp11], 8 \n\t"
"sra %[temp10], %[temp11], 31 \n\t"
"sra %[temp9], %[temp16], 8 \n\t"
"sra %[temp3], %[temp16], 31 \n\t"
"beqz %[temp18], 13f \n\t"
"xor %[temp5], %[temp5], %[temp5] \n\t"
"movz %[temp5], %[temp6], %[temp1] \n\t"
"13: \n\t"
"beqz %[temp17], 14f \n\t"
"xor %[temp8], %[temp8], %[temp8] \n\t"
"movz %[temp8], %[temp6], %[temp15] \n\t"
"14: \n\t"
"beqz %[temp12], 15f \n\t"
"xor %[temp11], %[temp11], %[temp11] \n\t"
"movz %[temp11], %[temp6], %[temp10] \n\t"
"15: \n\t"
"beqz %[temp9], 16f \n\t"
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp3] \n\t"
"16: \n\t"
"sb %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
"sb %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
[temp18]"=&r"(temp18)
: [in]"r"(p_in), [kC1]"r"(kC1), [kC2]"r"(kC2), [dst]"r"(dst)
: "memory", "hi", "lo"
);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
VP8InitClipTables();
VP8Transform = TransformTwo;
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter8 = VFilter8;
VP8HFilter8 = HFilter8;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
#endif // WEBP_USE_MIPS32

View File

@ -0,0 +1,994 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of dsp functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "dsp_mips_macro.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
static void TransformDC(const int16_t* in, uint8_t* dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
__asm__ volatile (
LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
0, 0, 0, 0,
0, 1, 2, 3,
BPS)
"lh %[temp5], 0(%[in]) \n\t"
"addiu %[temp5], %[temp5], 4 \n\t"
"ins %[temp5], %[temp5], 16, 16 \n\t"
"shra.ph %[temp5], %[temp5], 3 \n\t"
CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
temp3, temp1, temp2, temp3, temp4)
STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
dst, 0, 1, 2, 3, BPS)
OUTPUT_EARLY_CLOBBER_REGS_10()
: [in]"r"(in), [dst]"r"(dst)
: "memory"
);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
const int a = in[0] + 4;
int c4 = MUL(in[4], kC2);
const int d4 = MUL(in[4], kC1);
const int c1 = MUL(in[1], kC2);
const int d1 = MUL(in[1], kC1);
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
__asm__ volatile (
"ins %[c4], %[d4], 16, 16 \n\t"
"replv.ph %[temp1], %[a] \n\t"
"replv.ph %[temp4], %[d1] \n\t"
ADD_SUB_HALVES(temp2, temp3, temp1, c4)
"replv.ph %[temp5], %[c1] \n\t"
SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
0, 0, 0, 0,
0, 1, 2, 3,
BPS)
CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
temp11, temp17, temp3, temp5, temp11, temp12)
PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
temp4, temp7, temp6, temp10, temp9)
STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
temp17, temp12, temp18, temp1, temp8, temp2, temp4,
temp7, temp6, dst, 0, 1, 2, 3, BPS)
OUTPUT_EARLY_CLOBBER_REGS_18(),
[c4]"+&r"(c4)
: [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
: "memory"
);
}
static void TransformOne(const int16_t* in, uint8_t* dst) {
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
__asm__ volatile (
"ulw %[temp1], 0(%[in]) \n\t"
"ulw %[temp2], 16(%[in]) \n\t"
LOAD_IN_X2(temp5, temp6, 24, 26)
ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
LOAD_IN_X2(temp1, temp2, 8, 10)
MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
temp13, temp11, temp14, temp12)
INSERT_HALF_X2(temp8, temp7, temp10, temp9)
"ulw %[temp17], 4(%[in]) \n\t"
"ulw %[temp18], 20(%[in]) \n\t"
ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
LOAD_IN_X2(temp17, temp18, 12, 14)
LOAD_IN_X2(temp9, temp10, 28, 30)
MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
temp15, temp4, temp16, temp17)
INSERT_HALF_X2(temp11, temp12, temp13, temp14)
ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
// horizontal
SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
INSERT_HALF_X2(temp1, temp6, temp5, temp2)
SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
"repl.ph %[temp2], 0x4 \n\t"
INSERT_HALF_X2(temp3, temp8, temp17, temp4)
"addq.ph %[temp1], %[temp1], %[temp2] \n\t"
"addq.ph %[temp6], %[temp6], %[temp2] \n\t"
ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
temp6, temp17, temp8, temp18)
MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
temp18, temp12, temp17, temp16)
INSERT_HALF_X2(temp1, temp3, temp9, temp13)
INSERT_HALF_X2(temp6, temp8, temp11, temp15)
SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
temp6)
PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
temp16, temp11, temp10, temp15, temp14)
LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
0, 0, 0, 0,
0, 1, 2, 3,
BPS)
CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
temp11, temp10, temp11, temp14, temp15)
STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
dst, 0, 1, 2, 3, BPS)
OUTPUT_EARLY_CLOBBER_REGS_18()
: [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
: "memory", "hi", "lo"
);
}
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
if (do_two) {
TransformOne(in + 16, dst + 4);
}
}
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
const int thresh2 = 2 * thresh + 1;
int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
int temp10, temp11, temp12, temp13, temp14, temp15;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"1: \n\t"
"negu %[temp1], %[hstride] \n\t"
"addiu %[size], %[size], -1 \n\t"
"sll %[temp2], %[hstride], 1 \n\t"
"sll %[temp3], %[temp1], 1 \n\t"
"addu %[temp4], %[temp2], %[hstride] \n\t"
"addu %[temp5], %[temp3], %[temp1] \n\t"
"lbu %[temp7], 0(%[p]) \n\t"
"sll %[temp6], %[temp3], 1 \n\t"
"lbux %[temp8], %[temp5](%[p]) \n\t"
"lbux %[temp9], %[temp3](%[p]) \n\t"
"lbux %[temp10], %[temp1](%[p]) \n\t"
"lbux %[temp11], %[temp6](%[p]) \n\t"
"lbux %[temp12], %[hstride](%[p]) \n\t"
"lbux %[temp13], %[temp2](%[p]) \n\t"
"lbux %[temp14], %[temp4](%[p]) \n\t"
"subu %[temp1], %[temp10], %[temp7] \n\t"
"subu %[temp2], %[temp9], %[temp12] \n\t"
"absq_s.w %[temp3], %[temp1] \n\t"
"absq_s.w %[temp4], %[temp2] \n\t"
"negu %[temp1], %[temp1] \n\t"
"sll %[temp3], %[temp3], 2 \n\t"
"addu %[temp15], %[temp3], %[temp4] \n\t"
"subu %[temp3], %[temp15], %[thresh2] \n\t"
"sll %[temp6], %[temp1], 1 \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp4], %[temp11], %[temp8] \n\t"
"absq_s.w %[temp4], %[temp4] \n\t"
"shll_s.w %[temp2], %[temp2], 24 \n\t"
"subu %[temp4], %[temp4], %[ithresh] \n\t"
"bgtz %[temp4], 3f \n\t"
" subu %[temp3], %[temp8], %[temp9] \n\t"
"absq_s.w %[temp3], %[temp3] \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp5], %[temp9], %[temp10] \n\t"
"absq_s.w %[temp3], %[temp5] \n\t"
"absq_s.w %[temp5], %[temp5] \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp3], %[temp14], %[temp13] \n\t"
"absq_s.w %[temp3], %[temp3] \n\t"
"slt %[temp5], %[hev_thresh], %[temp5] \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp3], %[temp13], %[temp12] \n\t"
"absq_s.w %[temp3], %[temp3] \n\t"
"sra %[temp4], %[temp2], 24 \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" subu %[temp15], %[temp12], %[temp7] \n\t"
"absq_s.w %[temp3], %[temp15] \n\t"
"absq_s.w %[temp15], %[temp15] \n\t"
"subu %[temp3], %[temp3], %[ithresh] \n\t"
"bgtz %[temp3], 3f \n\t"
" slt %[temp15], %[hev_thresh], %[temp15] \n\t"
"addu %[temp3], %[temp6], %[temp1] \n\t"
"or %[temp2], %[temp5], %[temp15] \n\t"
"addu %[temp5], %[temp4], %[temp3] \n\t"
"beqz %[temp2], 4f \n\t"
" shra_r.w %[temp1], %[temp5], 3 \n\t"
"addiu %[temp2], %[temp5], 3 \n\t"
"sra %[temp2], %[temp2], 3 \n\t"
"shll_s.w %[temp1], %[temp1], 27 \n\t"
"shll_s.w %[temp2], %[temp2], 27 \n\t"
"subu %[temp3], %[p], %[hstride] \n\t"
"sra %[temp1], %[temp1], 27 \n\t"
"sra %[temp2], %[temp2], 27 \n\t"
"subu %[temp1], %[temp7], %[temp1] \n\t"
"addu %[temp2], %[temp10], %[temp2] \n\t"
"lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
"lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
"sb %[temp2], 0(%[temp3]) \n\t"
"j 3f \n\t"
" sb %[temp1], 0(%[p]) \n\t"
"4: \n\t"
"shll_s.w %[temp5], %[temp5], 24 \n\t"
"subu %[temp14], %[p], %[hstride] \n\t"
"subu %[temp11], %[temp14], %[hstride] \n\t"
"sra %[temp6], %[temp5], 24 \n\t"
"sll %[temp1], %[temp6], 3 \n\t"
"subu %[temp15], %[temp11], %[hstride] \n\t"
"addu %[temp2], %[temp6], %[temp1] \n\t"
"sll %[temp3], %[temp2], 1 \n\t"
"addu %[temp4], %[temp3], %[temp2] \n\t"
"addiu %[temp2], %[temp2], 63 \n\t"
"addiu %[temp3], %[temp3], 63 \n\t"
"addiu %[temp4], %[temp4], 63 \n\t"
"sra %[temp2], %[temp2], 7 \n\t"
"sra %[temp3], %[temp3], 7 \n\t"
"sra %[temp4], %[temp4], 7 \n\t"
"addu %[temp1], %[temp8], %[temp2] \n\t"
"addu %[temp5], %[temp9], %[temp3] \n\t"
"addu %[temp6], %[temp10], %[temp4] \n\t"
"subu %[temp8], %[temp7], %[temp4] \n\t"
"subu %[temp7], %[temp12], %[temp3] \n\t"
"addu %[temp10], %[p], %[hstride] \n\t"
"subu %[temp9], %[temp13], %[temp2] \n\t"
"addu %[temp12], %[temp10], %[hstride] \n\t"
"lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
"lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
"lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
"lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
"lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
"lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
"sb %[temp2], 0(%[temp15]) \n\t"
"sb %[temp3], 0(%[temp11]) \n\t"
"sb %[temp4], 0(%[temp14]) \n\t"
"sb %[temp5], 0(%[p]) \n\t"
"sb %[temp6], 0(%[temp10]) \n\t"
"sb %[temp8], 0(%[temp12]) \n\t"
"3: \n\t"
"bgtz %[size], 1b \n\t"
" addu %[p], %[p], %[vstride] \n\t"
".set pop \n\t"
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),[temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
[temp7]"=&r"(temp7),[temp8]"=&r"(temp8),[temp9]"=&r"(temp9),
[temp10]"=&r"(temp10),[temp11]"=&r"(temp11),[temp12]"=&r"(temp12),
[temp13]"=&r"(temp13),[temp14]"=&r"(temp14),[temp15]"=&r"(temp15),
[size]"+&r"(size), [p]"+&r"(p)
: [hstride]"r"(hstride), [thresh2]"r"(thresh2),
[ithresh]"r"(ithresh),[vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
[VP8kclip1]"r"(VP8kclip1)
: "memory"
);
}
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
int p0, q0, p1, q1, p2, q2, p3, q3;
int step1, step2, temp1, temp2, temp3, temp4;
uint8_t* pTemp0;
uint8_t* pTemp1;
const int thresh2 = 2 * thresh + 1;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"bltz %[size], 3f \n\t"
" nop \n\t"
"2: \n\t"
"negu %[step1], %[hstride] \n\t"
"lbu %[q0], 0(%[p]) \n\t"
"lbux %[p0], %[step1](%[p]) \n\t"
"subu %[step1], %[step1], %[hstride] \n\t"
"lbux %[q1], %[hstride](%[p]) \n\t"
"subu %[temp1], %[p0], %[q0] \n\t"
"lbux %[p1], %[step1](%[p]) \n\t"
"addu %[step2], %[hstride], %[hstride] \n\t"
"absq_s.w %[temp2], %[temp1] \n\t"
"subu %[temp3], %[p1], %[q1] \n\t"
"absq_s.w %[temp4], %[temp3] \n\t"
"sll %[temp2], %[temp2], 2 \n\t"
"addu %[temp2], %[temp2], %[temp4] \n\t"
"subu %[temp4], %[temp2], %[thresh2] \n\t"
"subu %[step1], %[step1], %[hstride] \n\t"
"bgtz %[temp4], 0f \n\t"
" lbux %[p2], %[step1](%[p]) \n\t"
"subu %[step1], %[step1], %[hstride] \n\t"
"lbux %[q2], %[step2](%[p]) \n\t"
"lbux %[p3], %[step1](%[p]) \n\t"
"subu %[temp4], %[p2], %[p1] \n\t"
"addu %[step2], %[step2], %[hstride] \n\t"
"subu %[temp2], %[p3], %[p2] \n\t"
"absq_s.w %[temp4], %[temp4] \n\t"
"absq_s.w %[temp2], %[temp2] \n\t"
"lbux %[q3], %[step2](%[p]) \n\t"
"subu %[temp4], %[temp4], %[ithresh] \n\t"
"negu %[temp1], %[temp1] \n\t"
"bgtz %[temp4], 0f \n\t"
" subu %[temp2], %[temp2], %[ithresh] \n\t"
"subu %[p3], %[p1], %[p0] \n\t"
"bgtz %[temp2], 0f \n\t"
" absq_s.w %[p3], %[p3] \n\t"
"subu %[temp4], %[q3], %[q2] \n\t"
"subu %[pTemp0], %[p], %[hstride] \n\t"
"absq_s.w %[temp4], %[temp4] \n\t"
"subu %[temp2], %[p3], %[ithresh] \n\t"
"sll %[step1], %[temp1], 1 \n\t"
"bgtz %[temp2], 0f \n\t"
" subu %[temp4], %[temp4], %[ithresh] \n\t"
"subu %[temp2], %[q2], %[q1] \n\t"
"bgtz %[temp4], 0f \n\t"
" absq_s.w %[temp2], %[temp2] \n\t"
"subu %[q3], %[q1], %[q0] \n\t"
"absq_s.w %[q3], %[q3] \n\t"
"subu %[temp2], %[temp2], %[ithresh] \n\t"
"addu %[temp1], %[temp1], %[step1] \n\t"
"bgtz %[temp2], 0f \n\t"
" subu %[temp4], %[q3], %[ithresh] \n\t"
"slt %[p3], %[hev_thresh], %[p3] \n\t"
"bgtz %[temp4], 0f \n\t"
" slt %[q3], %[hev_thresh], %[q3] \n\t"
"or %[q3], %[q3], %[p3] \n\t"
"bgtz %[q3], 1f \n\t"
" shra_r.w %[temp2], %[temp1], 3 \n\t"
"addiu %[temp1], %[temp1], 3 \n\t"
"sra %[temp1], %[temp1], 3 \n\t"
"shll_s.w %[temp2], %[temp2], 27 \n\t"
"shll_s.w %[temp1], %[temp1], 27 \n\t"
"addu %[pTemp1], %[p], %[hstride] \n\t"
"sra %[temp2], %[temp2], 27 \n\t"
"sra %[temp1], %[temp1], 27 \n\t"
"addiu %[step1], %[temp2], 1 \n\t"
"sra %[step1], %[step1], 1 \n\t"
"addu %[p0], %[p0], %[temp1] \n\t"
"addu %[p1], %[p1], %[step1] \n\t"
"subu %[q0], %[q0], %[temp2] \n\t"
"subu %[q1], %[q1], %[step1] \n\t"
"lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
"lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
"lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
"sb %[temp2], 0(%[pTemp0]) \n\t"
"lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
"subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
"sb %[temp3], 0(%[p]) \n\t"
"sb %[temp4], 0(%[pTemp1]) \n\t"
"j 0f \n\t"
" sb %[temp1], 0(%[pTemp0]) \n\t"
"1: \n\t"
"shll_s.w %[temp3], %[temp3], 24 \n\t"
"sra %[temp3], %[temp3], 24 \n\t"
"addu %[temp1], %[temp1], %[temp3] \n\t"
"shra_r.w %[temp2], %[temp1], 3 \n\t"
"addiu %[temp1], %[temp1], 3 \n\t"
"shll_s.w %[temp2], %[temp2], 27 \n\t"
"sra %[temp1], %[temp1], 3 \n\t"
"shll_s.w %[temp1], %[temp1], 27 \n\t"
"sra %[temp2], %[temp2], 27 \n\t"
"sra %[temp1], %[temp1], 27 \n\t"
"addu %[p0], %[p0], %[temp1] \n\t"
"subu %[q0], %[q0], %[temp2] \n\t"
"lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
"lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
"sb %[temp2], 0(%[p]) \n\t"
"sb %[temp1], 0(%[pTemp0]) \n\t"
"0: \n\t"
"subu %[size], %[size], 1 \n\t"
"bgtz %[size], 2b \n\t"
" addu %[p], %[p], %[vstride] \n\t"
"3: \n\t"
".set pop \n\t"
: [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
[p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
[step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
[size]"+&r"(size)
: [vstride]"r"(vstride), [ithresh]"r"(ithresh),
[hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
[VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
: "memory"
);
}
// on macroblock edges
static void VFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
static void HFilter16(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
// 8-pixels wide variant, for chroma filtering
static void VFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
}
// on three inner edges
static void VFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
}
}
static void HFilter16i(uint8_t* p, int stride,
int thresh, int ithresh, int hev_thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
}
}
static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
}
static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_thresh) {
FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
}
#undef MUL
//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
uint8_t* p1 = p - stride;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"li %[i], 16 \n\t"
"0: \n\t"
"negu %[temp4], %[stride] \n\t"
"sll %[temp5], %[temp4], 1 \n\t"
"lbu %[temp2], 0(%[p]) \n\t"
"lbux %[temp3], %[stride](%[p]) \n\t"
"lbux %[temp1], %[temp4](%[p]) \n\t"
"lbux %[temp0], %[temp5](%[p]) \n\t"
"subu %[temp7], %[temp1], %[temp2] \n\t"
"subu %[temp6], %[temp0], %[temp3] \n\t"
"absq_s.w %[temp4], %[temp7] \n\t"
"absq_s.w %[temp5], %[temp6] \n\t"
"sll %[temp4], %[temp4], 2 \n\t"
"subu %[temp5], %[temp5], %[thresh2] \n\t"
"addu %[temp5], %[temp4], %[temp5] \n\t"
"negu %[temp8], %[temp7] \n\t"
"bgtz %[temp5], 1f \n\t"
" addiu %[i], %[i], -1 \n\t"
"sll %[temp4], %[temp8], 1 \n\t"
"shll_s.w %[temp5], %[temp6], 24 \n\t"
"addu %[temp3], %[temp4], %[temp8] \n\t"
"sra %[temp5], %[temp5], 24 \n\t"
"addu %[temp3], %[temp3], %[temp5] \n\t"
"addiu %[temp7], %[temp3], 3 \n\t"
"sra %[temp7], %[temp7], 3 \n\t"
"shra_r.w %[temp8], %[temp3], 3 \n\t"
"shll_s.w %[temp0], %[temp7], 27 \n\t"
"shll_s.w %[temp4], %[temp8], 27 \n\t"
"sra %[temp0], %[temp0], 27 \n\t"
"sra %[temp4], %[temp4], 27 \n\t"
"addu %[temp7], %[temp1], %[temp0] \n\t"
"subu %[temp2], %[temp2], %[temp4] \n\t"
"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
"sb %[temp3], 0(%[p1]) \n\t"
"sb %[temp4], 0(%[p]) \n\t"
"1: \n\t"
"addiu %[p1], %[p1], 1 \n\t"
"bgtz %[i], 0b \n\t"
" addiu %[p], %[p], 1 \n\t"
" .set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
: "memory"
);
}
// TEMP0 = SRC[A + A1 * BPS]
// TEMP1 = SRC[B + B1 * BPS]
// TEMP2 = SRC[C + C1 * BPS]
// TEMP3 = SRC[D + D1 * BPS]
#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
A, A1, B, B1, C, C1, D, D1, SRC) \
"lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
"lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
"lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
"lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
int i;
const int thresh2 = 2 * thresh + 1;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"li %[i], 16 \n\t"
"0: \n\t"
LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
"subu %[temp7], %[temp1], %[temp2] \n\t"
"subu %[temp6], %[temp0], %[temp3] \n\t"
"absq_s.w %[temp4], %[temp7] \n\t"
"absq_s.w %[temp5], %[temp6] \n\t"
"sll %[temp4], %[temp4], 2 \n\t"
"addu %[temp5], %[temp4], %[temp5] \n\t"
"subu %[temp5], %[temp5], %[thresh2] \n\t"
"negu %[temp8], %[temp7] \n\t"
"bgtz %[temp5], 1f \n\t"
" addiu %[i], %[i], -1 \n\t"
"sll %[temp4], %[temp8], 1 \n\t"
"shll_s.w %[temp5], %[temp6], 24 \n\t"
"addu %[temp3], %[temp4], %[temp8] \n\t"
"sra %[temp5], %[temp5], 24 \n\t"
"addu %[temp3], %[temp3], %[temp5] \n\t"
"addiu %[temp7], %[temp3], 3 \n\t"
"sra %[temp7], %[temp7], 3 \n\t"
"shra_r.w %[temp8], %[temp3], 3 \n\t"
"shll_s.w %[temp0], %[temp7], 27 \n\t"
"shll_s.w %[temp4], %[temp8], 27 \n\t"
"sra %[temp0], %[temp0], 27 \n\t"
"sra %[temp4], %[temp4], 27 \n\t"
"addu %[temp7], %[temp1], %[temp0] \n\t"
"subu %[temp2], %[temp2], %[temp4] \n\t"
"lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
"lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
"sb %[temp3], -1(%[p]) \n\t"
"sb %[temp4], 0(%[p]) \n\t"
"1: \n\t"
"bgtz %[i], 0b \n\t"
" addu %[p], %[p], %[stride] \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[p]"+&r"(p), [i]"=&r"(i)
: [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
: "memory"
);
}
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4 * stride;
SimpleVFilter16(p, stride, thresh);
}
}
static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
int k;
for (k = 3; k > 0; --k) {
p += 4;
SimpleHFilter16(p, stride, thresh);
}
}
// DST[A * BPS] = TEMP0
// DST[B + C * BPS] = TEMP1
#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
"usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
"usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
static void VE4(uint8_t* dst) { // vertical
const uint8_t* top = dst - BPS;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
__asm__ volatile (
"ulw %[temp0], -1(%[top]) \n\t"
"ulh %[temp1], 3(%[top]) \n\t"
"preceu.ph.qbr %[temp2], %[temp0] \n\t"
"preceu.ph.qbl %[temp3], %[temp0] \n\t"
"preceu.ph.qbr %[temp4], %[temp1] \n\t"
"packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
"packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
"shll.ph %[temp5], %[temp5], 1 \n\t"
"shll.ph %[temp6], %[temp6], 1 \n\t"
"addq.ph %[temp2], %[temp5], %[temp2] \n\t"
"addq.ph %[temp6], %[temp6], %[temp4] \n\t"
"addq.ph %[temp2], %[temp2], %[temp3] \n\t"
"addq.ph %[temp6], %[temp6], %[temp3] \n\t"
"shra_r.ph %[temp2], %[temp2], 2 \n\t"
"shra_r.ph %[temp6], %[temp6], 2 \n\t"
"precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6)
: [top]"r"(top), [dst]"r"(dst)
: "memory"
);
}
static void DC4(uint8_t* dst) { // DC
int temp0, temp1, temp2, temp3, temp4;
__asm__ volatile (
"ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
"ins %[temp1], %[temp2], 8, 8 \n\t"
"ins %[temp1], %[temp3], 16, 8 \n\t"
"ins %[temp1], %[temp4], 24, 8 \n\t"
"raddu.w.qb %[temp0], %[temp0] \n\t"
"raddu.w.qb %[temp1], %[temp1] \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"shra_r.w %[temp0], %[temp0], 3 \n\t"
"replv.qb %[temp0], %[temp0] \n\t"
STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
: [dst]"r"(dst)
: "memory"
);
}
static void RD4(uint8_t* dst) { // Down-right
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8;
__asm__ volatile (
LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
"ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"preceu.ph.qbr %[temp5], %[temp7] \n\t"
"ins %[temp2], %[temp1], 16, 16 \n\t"
"preceu.ph.qbl %[temp4], %[temp7] \n\t"
"ins %[temp3], %[temp2], 16, 16 \n\t"
"shll.ph %[temp2], %[temp2], 1 \n\t"
"addq.ph %[temp3], %[temp3], %[temp1] \n\t"
"packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
"addq.ph %[temp3], %[temp3], %[temp2] \n\t"
"addq.ph %[temp1], %[temp1], %[temp5] \n\t"
"shll.ph %[temp6], %[temp6], 1 \n\t"
"addq.ph %[temp1], %[temp1], %[temp6] \n\t"
"packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
"addq.ph %[temp8], %[temp5], %[temp4] \n\t"
"shra_r.ph %[temp3], %[temp3], 2 \n\t"
"shll.ph %[temp0], %[temp0], 1 \n\t"
"shra_r.ph %[temp1], %[temp1], 2 \n\t"
"addq.ph %[temp8], %[temp0], %[temp8] \n\t"
"lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
"precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
"shra_r.ph %[temp8], %[temp8], 2 \n\t"
"ins %[temp7], %[temp5], 0, 8 \n\t"
"precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
"raddu.w.qb %[temp4], %[temp7] \n\t"
"precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
"shra_r.w %[temp4], %[temp4], 2 \n\t"
STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
"prepend %[temp2], %[temp8], 8 \n\t"
"prepend %[temp6], %[temp4], 8 \n\t"
STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
: [dst]"r"(dst)
: "memory"
);
}
// TEMP0 = SRC[A * BPS]
// TEMP1 = SRC[B + C * BPS]
#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
"ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
"ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
static void LD4(uint8_t* dst) { // Down-Left
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
__asm__ volatile (
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
"preceu.ph.qbl %[temp2], %[temp0] \n\t"
"preceu.ph.qbr %[temp3], %[temp0] \n\t"
"preceu.ph.qbr %[temp4], %[temp1] \n\t"
"preceu.ph.qbl %[temp5], %[temp1] \n\t"
"packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
"packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
"packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
"shll.ph %[temp6], %[temp6], 1 \n\t"
"addq.ph %[temp9], %[temp2], %[temp6] \n\t"
"shll.ph %[temp7], %[temp7], 1 \n\t"
"addq.ph %[temp9], %[temp9], %[temp3] \n\t"
"shll.ph %[temp8], %[temp8], 1 \n\t"
"shra_r.ph %[temp9], %[temp9], 2 \n\t"
"addq.ph %[temp3], %[temp4], %[temp7] \n\t"
"addq.ph %[temp0], %[temp5], %[temp8] \n\t"
"addq.ph %[temp3], %[temp3], %[temp2] \n\t"
"addq.ph %[temp0], %[temp0], %[temp4] \n\t"
"shra_r.ph %[temp3], %[temp3], 2 \n\t"
"shra_r.ph %[temp0], %[temp0], 2 \n\t"
"srl %[temp1], %[temp1], 24 \n\t"
"sll %[temp1], %[temp1], 1 \n\t"
"raddu.w.qb %[temp5], %[temp5] \n\t"
"precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
"precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
"addu %[temp1], %[temp1], %[temp5] \n\t"
"shra_r.w %[temp1], %[temp1], 2 \n\t"
STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
"prepend %[temp9], %[temp0], 8 \n\t"
"prepend %[temp3], %[temp1], 8 \n\t"
STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9)
: [dst]"r"(dst)
: "memory"
);
}
//------------------------------------------------------------------------------
// Chroma
static void DC8uv(uint8_t* dst) { // DC
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8, temp9;
__asm__ volatile (
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
"raddu.w.qb %[temp0], %[temp0] \n\t"
"raddu.w.qb %[temp1], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp3] \n\t"
"addu %[temp4], %[temp4], %[temp5] \n\t"
"addu %[temp6], %[temp6], %[temp7] \n\t"
"addu %[temp8], %[temp8], %[temp9] \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp4] \n\t"
"addu %[temp6], %[temp6], %[temp8] \n\t"
"addu %[temp0], %[temp0], %[temp2] \n\t"
"addu %[temp0], %[temp0], %[temp6] \n\t"
"shra_r.w %[temp0], %[temp0], 4 \n\t"
"replv.qb %[temp0], %[temp0] \n\t"
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9)
: [dst]"r"(dst)
: "memory"
);
}
static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
int temp0, temp1;
__asm__ volatile (
LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
"raddu.w.qb %[temp0], %[temp0] \n\t"
"raddu.w.qb %[temp1], %[temp1] \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"shra_r.w %[temp0], %[temp0], 3 \n\t"
"replv.qb %[temp0], %[temp0] \n\t"
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
: [dst]"r"(dst)
: "memory"
);
}
static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
int temp0, temp1, temp2, temp3, temp4;
int temp5, temp6, temp7, temp8;
__asm__ volatile (
LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
"addu %[temp2], %[temp2], %[temp3] \n\t"
"addu %[temp4], %[temp4], %[temp5] \n\t"
"addu %[temp6], %[temp6], %[temp7] \n\t"
"addu %[temp8], %[temp8], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp4] \n\t"
"addu %[temp6], %[temp6], %[temp8] \n\t"
"addu %[temp0], %[temp6], %[temp2] \n\t"
"shra_r.w %[temp0], %[temp0], 3 \n\t"
"replv.qb %[temp0], %[temp0] \n\t"
STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
: [dst]"r"(dst)
: "memory"
);
}
#undef LOAD_8_BYTES
#undef STORE_8_BYTES
#undef LOAD_4_BYTES
#define CLIPPING(SIZE) \
"preceu.ph.qbl %[temp2], %[temp0] \n\t" \
"preceu.ph.qbr %[temp0], %[temp0] \n\t" \
".if " #SIZE " == 8 \n\t" \
"preceu.ph.qbl %[temp3], %[temp1] \n\t" \
"preceu.ph.qbr %[temp1], %[temp1] \n\t" \
".endif \n\t" \
"addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \
"addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \
".if " #SIZE " == 8 \n\t" \
"addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \
"addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \
".endif \n\t" \
"shll_s.ph %[temp2], %[temp2], 7 \n\t" \
"shll_s.ph %[temp0], %[temp0], 7 \n\t" \
".if " #SIZE " == 8 \n\t" \
"shll_s.ph %[temp3], %[temp3], 7 \n\t" \
"shll_s.ph %[temp1], %[temp1], 7 \n\t" \
".endif \n\t" \
"precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
".if " #SIZE " == 8 \n\t" \
"precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \
".endif \n\t"
#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \
int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \
int temp0, temp1, temp2, temp3; \
__asm__ volatile ( \
".if " #SIZE " < 8 \n\t" \
"ulw %[temp0], 0(%[top]) \n\t" \
"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
CLIPPING(4) \
"usw %[temp0], 0(%[dst]) \n\t" \
".else \n\t" \
"ulw %[temp0], 0(%[top]) \n\t" \
"ulw %[temp1], 4(%[top]) \n\t" \
"subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
CLIPPING(8) \
"usw %[temp0], 0(%[dst]) \n\t" \
"usw %[temp1], 4(%[dst]) \n\t" \
".if " #SIZE " == 16 \n\t" \
"ulw %[temp0], 8(%[top]) \n\t" \
"ulw %[temp1], 12(%[top]) \n\t" \
CLIPPING(8) \
"usw %[temp0], 8(%[dst]) \n\t" \
"usw %[temp1], 12(%[dst]) \n\t" \
".endif \n\t" \
".endif \n\t" \
: [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
: [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \
: "memory" \
); \
} while (0)
#define CLIP_TO_DST(DST, SIZE) do { \
int y; \
const uint8_t* top = (DST) - BPS; \
const int top_1 = ((int)top[-1] << 16) + top[-1]; \
for (y = 0; y < (SIZE); ++y) { \
CLIP_8B_TO_DST((DST), top, (SIZE)); \
(DST) += BPS; \
} \
} while (0)
#define TRUE_MOTION(DST, SIZE) \
static void TrueMotion##SIZE(uint8_t* (DST)) { \
CLIP_TO_DST((DST), (SIZE)); \
}
TRUE_MOTION(dst, 4)
TRUE_MOTION(dst, 8)
TRUE_MOTION(dst, 16)
#undef TRUE_MOTION
#undef CLIP_TO_DST
#undef CLIP_8B_TO_DST
#undef CLIPPING
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
VP8TransformDC = TransformDC;
VP8TransformAC3 = TransformAC3;
VP8Transform = TransformTwo;
VP8VFilter16 = VFilter16;
VP8HFilter16 = HFilter16;
VP8VFilter8 = VFilter8;
VP8HFilter8 = HFilter8;
VP8VFilter16i = VFilter16i;
VP8HFilter16i = HFilter16i;
VP8VFilter8i = VFilter8i;
VP8HFilter8i = HFilter8i;
VP8SimpleVFilter16 = SimpleVFilter16;
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
VP8PredLuma4[0] = DC4;
VP8PredLuma4[1] = TrueMotion4;
VP8PredLuma4[2] = VE4;
VP8PredLuma4[4] = RD4;
VP8PredLuma4[6] = LD4;
VP8PredChroma8[0] = DC8uv;
VP8PredChroma8[1] = TrueMotion8;
VP8PredChroma8[4] = DC8uvNoTop;
VP8PredChroma8[5] = DC8uvNoLeft;
VP8PredLuma16[1] = TrueMotion16;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

1020
vendor/github.com/sizeofint/webpanimation/dsp_dec_msa.c generated vendored Normal file

File diff suppressed because it is too large Load Diff

1663
vendor/github.com/sizeofint/webpanimation/dsp_dec_neon.c generated vendored Normal file

File diff suppressed because it is too large Load Diff

1227
vendor/github.com/sizeofint/webpanimation/dsp_dec_sse2.c generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,46 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 version of some decoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include "dec_vp8i_dec.h"
#include "utils_utils.h"
static void HE16_SSE41(uint8_t* dst) { // horizontal
int j;
const __m128i kShuffle3 = _mm_set1_epi8(3);
for (j = 16; j > 0; --j) {
const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4));
const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
_mm_storeu_si128((__m128i*)dst, values);
dst += BPS;
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
VP8PredLuma16[3] = HE16_SSE41;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8DspInitSSE41)
#endif // WEBP_USE_SSE41

686
vendor/github.com/sizeofint/webpanimation/dsp_dsp.h generated vendored Normal file
View File

@ -0,0 +1,686 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DSP_DSP_H_
#define WEBP_DSP_DSP_H_
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "webp_types.h"
#ifdef __cplusplus
extern "C" {
#endif
#define BPS 32 // this is the common stride for enc/dec
//------------------------------------------------------------------------------
// CPU detection
#if defined(__GNUC__)
# define LOCAL_GCC_VERSION ((__GNUC__ << 8) | __GNUC_MINOR__)
# define LOCAL_GCC_PREREQ(maj, min) \
(LOCAL_GCC_VERSION >= (((maj) << 8) | (min)))
#else
# define LOCAL_GCC_VERSION 0
# define LOCAL_GCC_PREREQ(maj, min) 0
#endif
#if defined(__clang__)
# define LOCAL_CLANG_VERSION ((__clang_major__ << 8) | __clang_minor__)
# define LOCAL_CLANG_PREREQ(maj, min) \
(LOCAL_CLANG_VERSION >= (((maj) << 8) | (min)))
#else
# define LOCAL_CLANG_VERSION 0
# define LOCAL_CLANG_PREREQ(maj, min) 0
#endif
#ifndef __has_builtin
# define __has_builtin(x) 0
#endif
#if !defined(HAVE_CONFIG_H)
#if defined(_MSC_VER) && _MSC_VER > 1310 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
#endif
#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
(defined(_M_X64) || defined(_M_IX86))
#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets
#endif
#endif
// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
// files without intrinsics, allowing the corresponding Init() to be called.
// Files containing intrinsics will need to be built targeting the instruction
// set so should succeed on one of the earlier tests.
#if defined(__SSE2__) || defined(WEBP_MSC_SSE2) || defined(WEBP_HAVE_SSE2)
#define WEBP_USE_SSE2
#endif
#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
#define WEBP_USE_SSE41
#endif
#undef WEBP_MSC_SSE41
#undef WEBP_MSC_SSE2
// The intrinsics currently cause compiler errors with arm-nacl-gcc and the
// inline assembly would need to be modified for use with Native Client.
#if (defined(__ARM_NEON__) || \
defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \
!defined(__native_client__)
#define WEBP_USE_NEON
#endif
#if !defined(WEBP_USE_NEON) && defined(__ANDROID__) && \
defined(__ARM_ARCH_7A__) && defined(HAVE_CPU_FEATURES_H)
#define WEBP_ANDROID_NEON // Android targets that may have NEON
#define WEBP_USE_NEON
#endif
#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
#define WEBP_USE_NEON
#define WEBP_USE_INTRINSICS
#endif
#if defined(__mips__) && !defined(__mips64) && \
defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
#if (__mips_isa_rev >= 2)
#define WEBP_USE_MIPS32_R2
#if defined(__mips_dspr2) || (defined(__mips_dsp_rev) && __mips_dsp_rev >= 2)
#define WEBP_USE_MIPS_DSP_R2
#endif
#endif
#endif
#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5)
#define WEBP_USE_MSA
#endif
#ifndef WEBP_DSP_OMIT_C_CODE
#define WEBP_DSP_OMIT_C_CODE 1
#endif
#if (defined(__aarch64__) || defined(__ARM_NEON__)) && WEBP_DSP_OMIT_C_CODE
#define WEBP_NEON_OMIT_C_CODE 1
#else
#define WEBP_NEON_OMIT_C_CODE 0
#endif
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#define WEBP_NEON_WORK_AROUND_GCC 1
#else
#define WEBP_NEON_WORK_AROUND_GCC 0
#endif
// This macro prevents thread_sanitizer from reporting known concurrent writes.
#define WEBP_TSAN_IGNORE_FUNCTION
#if defined(__has_feature)
#if __has_feature(thread_sanitizer)
#undef WEBP_TSAN_IGNORE_FUNCTION
#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
#endif
#endif
#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
#include <pthread.h> // NOLINT
#define WEBP_DSP_INIT(func) do { \
static volatile VP8CPUInfo func ## _last_cpuinfo_used = \
(VP8CPUInfo)&func ## _last_cpuinfo_used; \
static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \
if (pthread_mutex_lock(&func ## _lock)) break; \
if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func(); \
func ## _last_cpuinfo_used = VP8GetCPUInfo; \
(void)pthread_mutex_unlock(&func ## _lock); \
} while (0)
#else // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
#define WEBP_DSP_INIT(func) do { \
static volatile VP8CPUInfo func ## _last_cpuinfo_used = \
(VP8CPUInfo)&func ## _last_cpuinfo_used; \
if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break; \
func(); \
func ## _last_cpuinfo_used = VP8GetCPUInfo; \
} while (0)
#endif // defined(WEBP_USE_THREAD) && !defined(_WIN32)
// Defines an Init + helper function that control multiple initialization of
// function pointers / tables.
/* Usage:
WEBP_DSP_INIT_FUNC(InitFunc) {
...function body
}
*/
#define WEBP_DSP_INIT_FUNC(name) \
static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \
WEBP_TSAN_IGNORE_FUNCTION void name(void) { \
WEBP_DSP_INIT(name ## _body); \
} \
static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void)
#define WEBP_UBSAN_IGNORE_UNDEF
#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
#if defined(__clang__) && defined(__has_attribute)
#if __has_attribute(no_sanitize)
// This macro prevents the undefined behavior sanitizer from reporting
// failures. This is only meant to silence unaligned loads on platforms that
// are known to support them.
#undef WEBP_UBSAN_IGNORE_UNDEF
#define WEBP_UBSAN_IGNORE_UNDEF \
__attribute__((no_sanitize("undefined")))
// This macro prevents the undefined behavior sanitizer from reporting
// failures related to unsigned integer overflows. This is only meant to
// silence cases where this well defined behavior is expected.
#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \
__attribute__((no_sanitize("unsigned-integer-overflow")))
#endif
#endif
// If 'ptr' is NULL, returns NULL. Otherwise returns 'ptr + off'.
// Prevents undefined behavior sanitizer nullptr-with-nonzero-offset warning.
#if !defined(WEBP_OFFSET_PTR)
#define WEBP_OFFSET_PTR(ptr, off) (((ptr) == NULL) ? NULL : ((ptr) + (off)))
#endif
// Regularize the definition of WEBP_SWAP_16BIT_CSP (backward compatibility)
#if !defined(WEBP_SWAP_16BIT_CSP)
#define WEBP_SWAP_16BIT_CSP 0
#endif
// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
#if !defined(WORDS_BIGENDIAN) && \
(defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
(defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
#define WORDS_BIGENDIAN
#endif
typedef enum {
kSSE2,
kSSE3,
kSlowSSSE3, // special feature for slow SSSE3 architectures
kSSE4_1,
kAVX,
kAVX2,
kNEON,
kMIPS32,
kMIPSdspR2,
kMSA
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
//------------------------------------------------------------------------------
// Init stub generator
// Defines an init function stub to ensure each module exposes a symbol,
// avoiding a compiler warning.
#define WEBP_DSP_INIT_STUB(func) \
extern void func(void); \
void func(void) {}
//------------------------------------------------------------------------------
// Encoding
// Transforms
// VP8Idct: Does one of two inverse transforms. If do_two is set, the transforms
// will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two);
typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
extern VP8Fdct VP8FTransform2; // performs two transforms at a time
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
typedef void (*VP8IntraPreds)(uint8_t* dst, const uint8_t* left,
const uint8_t* top);
typedef void (*VP8Intra4Preds)(uint8_t* dst, const uint8_t* top);
extern VP8Intra4Preds VP8EncPredLuma4;
extern VP8IntraPreds VP8EncPredLuma16;
extern VP8IntraPreds VP8EncPredChroma8;
typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref);
extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4;
typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
const uint16_t* const weights);
// The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major
// 4 by 4 symmetric matrix.
extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
// Compute the average (DC) of four 4x4 blocks.
// Each sub-4x4 block #i sum is stored in dc[i].
typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]);
extern VP8MeanMetric VP8Mean16x4;
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
extern VP8BlockCopy VP8Copy4x4;
extern VP8BlockCopy VP8Copy16x8;
// Quantization
struct VP8Matrix; // forward declaration
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
const struct VP8Matrix* const mtx);
// Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
const struct VP8Matrix* const mtx);
extern VP8QuantizeBlock VP8EncQuantizeBlock;
extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
// specific to 2nd transform:
typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
const struct VP8Matrix* const mtx);
extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
extern const int VP8DspScan[16 + 4 + 4];
// Collect histogram for susceptibility calculation.
#define MAX_COEFF_THRESH 31 // size of histogram used by CollectHistogram.
typedef struct {
// We only need to store max_value and last_non_zero, not the distribution.
int max_value;
int last_non_zero;
} VP8Histogram;
typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo);
extern VP8CHisto VP8CollectHistogram;
// General-purpose util function to help VP8CollectHistogram().
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
VP8Histogram* const histo);
// must be called before using any of the above
void VP8EncDspInit(void);
//------------------------------------------------------------------------------
// cost functions (encoding)
extern const uint16_t VP8EntropyCost[256]; // 8bit fixed-point log(p)
// approximate cost per level:
extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
extern const uint8_t VP8EncBands[16 + 1];
struct VP8Residual;
typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
struct VP8Residual* const res);
extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
// Cost calculation function.
typedef int (*VP8GetResidualCostFunc)(int ctx0,
const struct VP8Residual* const res);
extern VP8GetResidualCostFunc VP8GetResidualCost;
// must be called before anything using the above
void VP8EncDspCostInit(void);
//------------------------------------------------------------------------------
// SSIM / PSNR utils
// struct for accumulating statistical moments
typedef struct {
uint32_t w; // sum(w_i) : sum of weights
uint32_t xm, ym; // sum(w_i * x_i), sum(w_i * y_i)
uint32_t xxm, xym, yym; // sum(w_i * x_i * x_i), etc.
} VP8DistoStats;
// Compute the final SSIM value
// The non-clipped version assumes stats->w = (2 * VP8_SSIM_KERNEL + 1)^2.
double VP8SSIMFromStats(const VP8DistoStats* const stats);
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats);
#define VP8_SSIM_KERNEL 3 // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1
typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2,
int xo, int yo, // center position
int W, int H); // plane dimension
#if !defined(WEBP_REDUCE_SIZE)
// This version is called with the guarantee that you can load 8 bytes and
// 8 rows at offset src1 and src2
typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2);
extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked
extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping
#endif
#if !defined(WEBP_DISABLE_STATS)
typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
const uint8_t* src2, int len);
extern VP8AccumulateSSEFunc VP8AccumulateSSE;
#endif
// must be called before using any of the above directly
void VP8SSIMDspInit(void);
//------------------------------------------------------------------------------
// Decoding
typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
// when doing two transforms, coeffs is actually int16_t[2][16].
typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
extern VP8DecIdct2 VP8Transform;
extern VP8DecIdct VP8TransformAC3;
extern VP8DecIdct VP8TransformUV;
extern VP8DecIdct VP8TransformDC;
extern VP8DecIdct VP8TransformDCUV;
extern VP8WHT VP8TransformWHT;
// *dst is the destination block, with stride BPS. Boundary samples are
// assumed accessible when needed.
typedef void (*VP8PredFunc)(uint8_t* dst);
extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
// clipping tables (for filtering)
extern const int8_t* const VP8ksclip1; // clips [-1020, 1020] to [-128, 127]
extern const int8_t* const VP8ksclip2; // clips [-112, 112] to [-16, 15]
extern const uint8_t* const VP8kclip1; // clips [-255,511] to [0,255]
extern const uint8_t* const VP8kabs0; // abs(x) for x in [-255,255]
// must be called first
void VP8InitClipTables(void);
// simple filter (only for luma)
typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
extern VP8SimpleFilterFunc VP8SimpleVFilter16;
extern VP8SimpleFilterFunc VP8SimpleHFilter16;
extern VP8SimpleFilterFunc VP8SimpleVFilter16i; // filter 3 inner edges
extern VP8SimpleFilterFunc VP8SimpleHFilter16i;
// regular filter (on both macroblock edges and inner edges)
typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride,
int thresh, int ithresh, int hev_t);
typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride,
int thresh, int ithresh, int hev_t);
// on outer edge
extern VP8LumaFilterFunc VP8VFilter16;
extern VP8LumaFilterFunc VP8HFilter16;
extern VP8ChromaFilterFunc VP8VFilter8;
extern VP8ChromaFilterFunc VP8HFilter8;
// on inner edge
extern VP8LumaFilterFunc VP8VFilter16i; // filtering 3 inner edges altogether
extern VP8LumaFilterFunc VP8HFilter16i;
extern VP8ChromaFilterFunc VP8VFilter8i; // filtering u and v altogether
extern VP8ChromaFilterFunc VP8HFilter8i;
// Dithering. Combines dithering values (centered around 128) with dst[],
// according to: dst[] = clip(dst[] + (((dither[]-128) + 8) >> 4)
#define VP8_DITHER_DESCALE 4
#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1))
#define VP8_DITHER_AMP_BITS 7
#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS)
extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst,
int dst_stride);
// must be called before anything using the above
void VP8DspInit(void);
//------------------------------------------------------------------------------
// WebP I/O
#define FANCY_UPSAMPLING // undefined to remove fancy upsampling support
// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
// bottom_y can be NULL if only one line of output is needed (at top/bottom).
typedef void (*WebPUpsampleLinePairFunc)(
const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
const uint8_t* cur_u, const uint8_t* cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len);
#ifdef FANCY_UPSAMPLING
// Fancy upsampling functions to convert YUV to RGB(A) modes
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
#endif // FANCY_UPSAMPLING
// Per-row point-sampling methods.
typedef void (*WebPSamplerRowFunc)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
// Generic function to apply 'WebPSamplerRowFunc' to the whole plane:
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
const uint8_t* u, const uint8_t* v, int uv_stride,
uint8_t* dst, int dst_stride,
int width, int height, WebPSamplerRowFunc func);
// Sampling functions to convert rows of YUV to RGB(A)
extern WebPSamplerRowFunc WebPSamplers[/* MODE_LAST */];
// General function for converting two lines of ARGB or RGBA.
// 'alpha_is_last' should be true if 0xff000000 is stored in memory as
// as 0x00, 0x00, 0x00, 0xff (little endian).
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last);
// YUV444->RGB converters
typedef void (*WebPYUV444Converter)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
// Must be called before using the WebPUpsamplers[] (and for premultiplied
// colorspaces like rgbA, rgbA4444, etc)
void WebPInitUpsamplers(void);
// Must be called before using WebPSamplers[]
void WebPInitSamplers(void);
// Must be called before using WebPYUV444Converters[]
void WebPInitYUV444Converters(void);
//------------------------------------------------------------------------------
// ARGB -> YUV converters
// Convert ARGB samples to luma Y.
extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
// even lines and '0' for odd ones. 'src_width' is the original width, not
// the U/V one.
extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store);
// Convert a row of accumulated (four-values) of rgba32 toward U/V
extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
// Convert RGB or BGR to Y
extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
// used for plain-C fallback.
extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store);
extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
// utilities for accurate RGB->YUV conversion
extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
uint16_t* dst, int len);
extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
int16_t* dst, int len);
extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
int len,
const uint16_t* best_y, uint16_t* out);
// Must be called before using the above.
void WebPInitConvertARGBToYUV(void);
//------------------------------------------------------------------------------
// Rescaler
struct WebPRescaler;
// Import a row of data and save its contribution in the rescaler.
// 'channel' denotes the channel number to be imported. 'Expand' corresponds to
// the wrk->x_expand case. Otherwise, 'Shrink' is to be used.
typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
const uint8_t* src);
extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
// Export one row (starting at x_out position) from rescaler.
// 'Expand' corresponds to the wrk->y_expand case.
// Otherwise 'Shrink' is to be used
typedef void (*WebPRescalerExportRowFunc)(struct WebPRescaler* const wrk);
extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
// Plain-C implementation, as fall-back.
extern void WebPRescalerImportRowExpand_C(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerImportRowShrink_C(struct WebPRescaler* const wrk,
const uint8_t* src);
extern void WebPRescalerExportRowExpand_C(struct WebPRescaler* const wrk);
extern void WebPRescalerExportRowShrink_C(struct WebPRescaler* const wrk);
// Main entry calls:
extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
const uint8_t* src);
// Export one row (starting at x_out position) from rescaler.
extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
// Must be called first before using the above.
void WebPRescalerDspInit(void);
//------------------------------------------------------------------------------
// Utilities for processing transparent channel.
// Apply alpha pre-multiply on an rgba, bgra or argb plane of size w * h.
// alpha_first should be 0 for argb, 1 for rgba or bgra (where alpha is last).
extern void (*WebPApplyAlphaMultiply)(
uint8_t* rgba, int alpha_first, int w, int h, int stride);
// Same, buf specifically for RGBA4444 format
extern void (*WebPApplyAlphaMultiply4444)(
uint8_t* rgba4444, int w, int h, int stride);
// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
// Returns true if alpha[] plane has non-trivial values different from 0xff.
extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint8_t* dst, int dst_stride);
// Transfer packed 8b alpha[] values to green channel in dst[], zero'ing the
// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
int width, int height,
uint32_t* dst, int dst_stride);
// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlpha).
// Returns true if there's only trivial 0xff alpha values.
extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride);
// Extract the green values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlphaToGreen).
extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
// Un-Multiply operation transforms x into x * 255 / A.
// Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
extern void (*WebPMultARGBRow)(uint32_t* const ptr, int width, int inverse);
// Same a WebPMultARGBRow(), but for several rows.
void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
int inverse);
// Same for a row of single values, with side alpha values.
extern void (*WebPMultRow)(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
// Same a WebPMultRow(), but for several 'num_rows' rows.
void WebPMultRows(uint8_t* ptr, int stride,
const uint8_t* alpha, int alpha_stride,
int width, int num_rows, int inverse);
// Plain-C versions, used as fallback by some implementations.
void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
int width, int inverse);
void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
#ifdef WORDS_BIGENDIAN
// ARGB packing function: a/r/g/b input is rgba or bgra order.
extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
const uint8_t* g, const uint8_t* b, int len,
uint32_t* out);
#endif
// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
int len, int step, uint32_t* out);
// This function returns true if src[i] contains a value different from 0xff.
extern int (*WebPHasAlpha8b)(const uint8_t* src, int length);
// This function returns true if src[4*i] contains a value different from 0xff.
extern int (*WebPHasAlpha32b)(const uint8_t* src, int length);
// replaces transparent values in src[] by 'color'.
extern void (*WebPAlphaReplace)(uint32_t* src, int length, uint32_t color);
// To be called first before using the above.
void WebPInitAlphaProcessing(void);
//------------------------------------------------------------------------------
// Filter functions
typedef enum { // Filter types.
WEBP_FILTER_NONE = 0,
WEBP_FILTER_HORIZONTAL,
WEBP_FILTER_VERTICAL,
WEBP_FILTER_GRADIENT,
WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1, // end marker
WEBP_FILTER_BEST, // meta-types
WEBP_FILTER_FAST
} WEBP_FILTER_TYPE;
typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
int stride, uint8_t* out);
// In-place un-filtering.
// Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'.
typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds,
uint8_t* cur_line, int width);
// Filter the given data using the given predictor.
// 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
// in raster order.
// 'stride' is number of bytes per scan line (with possible padding).
// 'out' should be pre-allocated.
extern WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
// In-place reconstruct the original data from the given filtered data.
// The reconstruction will be done for 'num_rows' rows starting from 'row'
// (assuming rows upto 'row - 1' are already reconstructed).
extern WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
// To be called first before using the above.
void VP8FiltersInit(void);
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_DSP_H_

830
vendor/github.com/sizeofint/webpanimation/dsp_enc.c generated vendored Normal file
View File

@ -0,0 +1,830 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Speed-critical encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h> // for abs()
#include "dsp_dsp.h"
#include "enc_vp8i_enc.h"
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
}
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int clip_max(int v, int max) {
return (v > max) ? max : v;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
const int VP8DspScan[16 + 4 + 4] = {
// Luma
0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS,
0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS,
0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS,
0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U
8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};
// general-purpose util function
void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
VP8Histogram* const histo) {
int max_value = 0, last_non_zero = 1;
int k;
for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
const int value = distribution[k];
if (value > 0) {
if (value > max_value) max_value = value;
last_non_zero = k;
}
}
histo->max_value = max_value;
histo->last_non_zero = last_non_zero;
}
#if !WEBP_NEON_OMIT_C_CODE
static void CollectHistogram_C(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int k;
int16_t out[16];
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
const int v = abs(out[k]) >> 3;
const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
++distribution[clipped_value];
}
}
VP8SetHistogramData(distribution, histo);
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// run-time tables (~4k)
static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
// We declare this variable 'volatile' to prevent instruction reordering
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
if (!tables_ok) {
int i;
for (i = -255; i <= 255 + 255; ++i) {
clip1[255 + i] = clip_8b(i);
}
tables_ok = 1;
}
}
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
#if !WEBP_NEON_OMIT_C_CODE
#define STORE(x, y, v) \
dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3))
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
#define MUL(a, b) (((a) * (b)) >> 16)
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
int C[4 * 4], *tmp;
int i;
tmp = C;
for (i = 0; i < 4; ++i) { // vertical pass
const int a = in[0] + in[8];
const int b = in[0] - in[8];
const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
tmp[0] = a + d;
tmp[1] = b + c;
tmp[2] = b - c;
tmp[3] = a - d;
tmp += 4;
in++;
}
tmp = C;
for (i = 0; i < 4; ++i) { // horizontal pass
const int dc = tmp[0] + 4;
const int a = dc + tmp[8];
const int b = dc - tmp[8];
const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
STORE(0, i, a + d);
STORE(1, i, b + c);
STORE(2, i, b - c);
STORE(3, i, a - d);
tmp++;
}
}
static void ITransform_C(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform_C(const uint8_t* src, const uint8_t* ref, int16_t* out) {
int i;
int tmp[16];
for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
const int d0 = src[0] - ref[0]; // 9bit dynamic range ([-255,255])
const int d1 = src[1] - ref[1];
const int d2 = src[2] - ref[2];
const int d3 = src[3] - ref[3];
const int a0 = (d0 + d3); // 10b [-510,510]
const int a1 = (d1 + d2);
const int a2 = (d1 - d2);
const int a3 = (d0 - d3);
tmp[0 + i * 4] = (a0 + a1) * 8; // 14b [-8160,8160]
tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9; // [-7536,7542]
tmp[2 + i * 4] = (a0 - a1) * 8;
tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 937) >> 9;
}
for (i = 0; i < 4; ++i) {
const int a0 = (tmp[0 + i] + tmp[12 + i]); // 15b
const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
const int a3 = (tmp[0 + i] - tmp[12 + i]);
out[0 + i] = (a0 + a1 + 7) >> 4; // 12b
out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
out[8 + i] = (a0 - a1 + 7) >> 4;
out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void FTransform2_C(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
VP8FTransform(src, ref, out);
VP8FTransform(src + 4, ref + 4, out + 16);
}
#if !WEBP_NEON_OMIT_C_CODE
static void FTransformWHT_C(const int16_t* in, int16_t* out) {
// input is 12b signed
int32_t tmp[16];
int i;
for (i = 0; i < 4; ++i, in += 64) {
const int a0 = (in[0 * 16] + in[2 * 16]); // 13b
const int a1 = (in[1 * 16] + in[3 * 16]);
const int a2 = (in[1 * 16] - in[3 * 16]);
const int a3 = (in[0 * 16] - in[2 * 16]);
tmp[0 + i * 4] = a0 + a1; // 14b
tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
for (i = 0; i < 4; ++i) {
const int a0 = (tmp[0 + i] + tmp[8 + i]); // 15b
const int a1 = (tmp[4 + i] + tmp[12+ i]);
const int a2 = (tmp[4 + i] - tmp[12+ i]);
const int a3 = (tmp[0 + i] - tmp[8 + i]);
const int b0 = a0 + a1; // 16b
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
out[ 0 + i] = b0 >> 1; // 15b
out[ 4 + i] = b1 >> 1;
out[ 8 + i] = b2 >> 1;
out[12 + i] = b3 >> 1;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MUL
#undef STORE
//------------------------------------------------------------------------------
// Intra predictions
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
int j;
for (j = 0; j < size; ++j) {
memset(dst + j * BPS, value, size);
}
}
static WEBP_INLINE void VerticalPred(uint8_t* dst,
const uint8_t* top, int size) {
int j;
if (top != NULL) {
for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
} else {
Fill(dst, 127, size);
}
}
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
const uint8_t* left, int size) {
if (left != NULL) {
int j;
for (j = 0; j < size; ++j) {
memset(dst + j * BPS, left[j], size);
}
} else {
Fill(dst, 129, size);
}
}
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
int y;
if (left != NULL) {
if (top != NULL) {
const uint8_t* const clip = clip1 + 255 - left[-1];
for (y = 0; y < size; ++y) {
const uint8_t* const clip_table = clip + left[y];
int x;
for (x = 0; x < size; ++x) {
dst[x] = clip_table[top[x]];
}
dst += BPS;
}
} else {
HorizontalPred(dst, left, size);
}
} else {
// true motion without left samples (hence: with default 129 value)
// is equivalent to VE prediction where you just copy the top samples.
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
if (top != NULL) {
VerticalPred(dst, top, size);
} else {
Fill(dst, 129, size);
}
}
}
static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
const uint8_t* top,
int size, int round, int shift) {
int DC = 0;
int j;
if (top != NULL) {
for (j = 0; j < size; ++j) DC += top[j];
if (left != NULL) { // top and left present
for (j = 0; j < size; ++j) DC += left[j];
} else { // top, but no left
DC += DC;
}
DC = (DC + round) >> shift;
} else if (left != NULL) { // left but no top
for (j = 0; j < size; ++j) DC += left[j];
DC += DC;
DC = (DC + round) >> shift;
} else { // no top, no left, nothing.
DC = 0x80;
}
Fill(dst, DC, size);
}
//------------------------------------------------------------------------------
// Chroma 8x8 prediction (paragraph 12.2)
static void IntraChromaPreds_C(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
// V block
dst += 8;
if (top != NULL) top += 8;
if (left != NULL) left += 16;
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
TrueMotion(C8TM8 + dst, left, top, 8);
}
//------------------------------------------------------------------------------
// luma 16x16 prediction (paragraph 12.3)
static void Intra16Preds_C(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode(I16DC16 + dst, left, top, 16, 16, 5);
VerticalPred(I16VE16 + dst, top, 16);
HorizontalPred(I16HE16 + dst, left, 16);
TrueMotion(I16TM16 + dst, left, top, 16);
}
//------------------------------------------------------------------------------
// luma 4x4 prediction
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
AVG3(top[ 0], top[1], top[2]),
AVG3(top[ 1], top[2], top[3]),
AVG3(top[ 2], top[3], top[4])
};
int i;
for (i = 0; i < 4; ++i) {
memcpy(dst + i * BPS, vals, 4);
}
}
static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static void DC4(uint8_t* dst, const uint8_t* top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
Fill(dst, dc >> 3, 4);
}
static void RD4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
DST(0, 3) = AVG3(J, K, L);
DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
DST(3, 0) = AVG3(D, C, B);
}
static void LD4(uint8_t* dst, const uint8_t* top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
const int E = top[4];
const int F = top[5];
const int G = top[6];
const int H = top[7];
DST(0, 0) = AVG3(A, B, C);
DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
DST(3, 3) = AVG3(G, H, H);
}
static void VR4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
DST(0, 0) = DST(1, 2) = AVG2(X, A);
DST(1, 0) = DST(2, 2) = AVG2(A, B);
DST(2, 0) = DST(3, 2) = AVG2(B, C);
DST(3, 0) = AVG2(C, D);
DST(0, 3) = AVG3(K, J, I);
DST(0, 2) = AVG3(J, I, X);
DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
DST(3, 1) = AVG3(B, C, D);
}
static void VL4(uint8_t* dst, const uint8_t* top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
const int E = top[4];
const int F = top[5];
const int G = top[6];
const int H = top[7];
DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
DST(3, 2) = AVG3(E, F, G);
DST(3, 3) = AVG3(F, G, H);
}
static void HU4(uint8_t* dst, const uint8_t* top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
DST(0, 0) = AVG2(I, J);
DST(2, 0) = DST(0, 1) = AVG2(J, K);
DST(2, 1) = DST(0, 2) = AVG2(K, L);
DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
DST(3, 2) = DST(2, 2) =
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static void HD4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
const int A = top[0];
const int B = top[1];
const int C = top[2];
DST(0, 0) = DST(2, 1) = AVG2(I, X);
DST(0, 1) = DST(2, 2) = AVG2(J, I);
DST(0, 2) = DST(2, 3) = AVG2(K, J);
DST(0, 3) = AVG2(L, K);
DST(3, 0) = AVG3(A, B, C);
DST(2, 0) = AVG3(X, A, B);
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
DST(1, 3) = AVG3(L, K, J);
}
static void TM4(uint8_t* dst, const uint8_t* top) {
int x, y;
const uint8_t* const clip = clip1 + 255 - top[-1];
for (y = 0; y < 4; ++y) {
const uint8_t* const clip_table = clip + top[-2 - y];
for (x = 0; x < 4; ++x) {
dst[x] = clip_table[top[x]];
}
dst += BPS;
}
}
#undef DST
#undef AVG3
#undef AVG2
// Left samples are top[-5 .. -2], top_left is top[-1], top are
// located at top[0..3], and top right is top[4..7]
static void Intra4Preds_C(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
HE4(I4HE4 + dst, top);
RD4(I4RD4 + dst, top);
VR4(I4VR4 + dst, top);
LD4(I4LD4 + dst, top);
VL4(I4VL4 + dst, top);
HD4(I4HD4 + dst, top);
HU4(I4HU4 + dst, top);
}
//------------------------------------------------------------------------------
// Metric
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE int GetSSE(const uint8_t* a, const uint8_t* b,
int w, int h) {
int count = 0;
int y, x;
for (y = 0; y < h; ++y) {
for (x = 0; x < w; ++x) {
const int diff = (int)a[x] - b[x];
count += diff * diff;
}
a += BPS;
b += BPS;
}
return count;
}
static int SSE16x16_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 16);
}
static int SSE16x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 16, 8);
}
static int SSE8x8_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 8, 8);
}
static int SSE4x4_C(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 4, 4);
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void Mean16x4_C(const uint8_t* ref, uint32_t dc[4]) {
int k, x, y;
for (k = 0; k < 4; ++k) {
uint32_t avg = 0;
for (y = 0; y < 4; ++y) {
for (x = 0; x < 4; ++x) {
avg += ref[x + y * BPS];
}
}
dc[k] = avg;
ref += 4; // go to next 4x4 block.
}
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
#if !WEBP_NEON_OMIT_C_CODE
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* in, const uint16_t* w) {
int sum = 0;
int tmp[16];
int i;
// horizontal pass
for (i = 0; i < 4; ++i, in += BPS) {
const int a0 = in[0] + in[2];
const int a1 = in[1] + in[3];
const int a2 = in[1] - in[3];
const int a3 = in[0] - in[2];
tmp[0 + i * 4] = a0 + a1;
tmp[1 + i * 4] = a3 + a2;
tmp[2 + i * 4] = a3 - a2;
tmp[3 + i * 4] = a0 - a1;
}
// vertical pass
for (i = 0; i < 4; ++i, ++w) {
const int a0 = tmp[0 + i] + tmp[8 + i];
const int a1 = tmp[4 + i] + tmp[12+ i];
const int a2 = tmp[4 + i] - tmp[12+ i];
const int a3 = tmp[0 + i] - tmp[8 + i];
const int b0 = a0 + a1;
const int b1 = a3 + a2;
const int b2 = a3 - a2;
const int b3 = a0 - a1;
sum += w[ 0] * abs(b0);
sum += w[ 4] * abs(b1);
sum += w[ 8] * abs(b2);
sum += w[12] * abs(b3);
}
return sum;
}
static int Disto4x4_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform(a, w);
const int sum2 = TTransform(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16_C(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_C(a + x + y, b + x + y, w);
}
}
return D;
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Quantization
//
static const uint8_t kZigzag[16] = {
0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
// Simple quantization
static int QuantizeBlock_C(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int last = -1;
int n;
for (n = 0; n < 16; ++n) {
const int j = kZigzag[n];
const int sign = (in[j] < 0);
const uint32_t coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
if (coeff > mtx->zthresh_[j]) {
const uint32_t Q = mtx->q_[j];
const uint32_t iQ = mtx->iq_[j];
const uint32_t B = mtx->bias_[j];
int level = QUANTDIV(coeff, iQ, B);
if (level > MAX_LEVEL) level = MAX_LEVEL;
if (sign) level = -level;
in[j] = level * (int)Q;
out[n] = level;
if (level) last = n;
} else {
out[n] = 0;
in[j] = 0;
}
}
return (last >= 0);
}
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
static int Quantize2Blocks_C(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#endif // !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Block copy
static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
int y;
for (y = 0; y < h; ++y) {
memcpy(dst, src, w);
src += BPS;
dst += BPS;
}
}
static void Copy4x4_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 4, 4);
}
static void Copy16x8_C(const uint8_t* src, uint8_t* dst) {
Copy(src, dst, 16, 8);
}
//------------------------------------------------------------------------------
// Initialization
// Speed-critical function pointers. We have to initialize them to the default
// implementations within VP8EncDspInit().
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
VP8IntraPreds VP8EncPredChroma8;
VP8Metric VP8SSE16x16;
VP8Metric VP8SSE8x8;
VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8MeanMetric VP8Mean16x4;
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
VP8BlockCopy VP8Copy16x8;
extern void VP8EncDspInitSSE2(void);
extern void VP8EncDspInitSSE41(void);
extern void VP8EncDspInitNEON(void);
extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
extern void VP8EncDspInitMSA(void);
WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
VP8DspInit(); // common inverse transforms
InitTables();
// default C implementations
#if !WEBP_NEON_OMIT_C_CODE
VP8ITransform = ITransform_C;
VP8FTransform = FTransform_C;
VP8FTransformWHT = FTransformWHT_C;
VP8TDisto4x4 = Disto4x4_C;
VP8TDisto16x16 = Disto16x16_C;
VP8CollectHistogram = CollectHistogram_C;
VP8SSE16x16 = SSE16x16_C;
VP8SSE16x8 = SSE16x8_C;
VP8SSE8x8 = SSE8x8_C;
VP8SSE4x4 = SSE4x4_C;
#endif
#if !WEBP_NEON_OMIT_C_CODE || WEBP_NEON_WORK_AROUND_GCC
VP8EncQuantizeBlock = QuantizeBlock_C;
VP8EncQuantize2Blocks = Quantize2Blocks_C;
#endif
VP8FTransform2 = FTransform2_C;
VP8EncPredLuma4 = Intra4Preds_C;
VP8EncPredLuma16 = Intra16Preds_C;
VP8EncPredChroma8 = IntraChromaPreds_C;
VP8Mean16x4 = Mean16x4_C;
VP8EncQuantizeBlockWHT = QuantizeBlock_C;
VP8Copy4x4 = Copy4x4_C;
VP8Copy16x8 = Copy16x8_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspInitSSE2();
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8EncDspInitSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8EncDspInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8EncDspInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
VP8EncDspInitMSA();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8EncDspInitNEON();
}
#endif
assert(VP8ITransform != NULL);
assert(VP8FTransform != NULL);
assert(VP8FTransformWHT != NULL);
assert(VP8TDisto4x4 != NULL);
assert(VP8TDisto16x16 != NULL);
assert(VP8CollectHistogram != NULL);
assert(VP8SSE16x16 != NULL);
assert(VP8SSE16x8 != NULL);
assert(VP8SSE8x8 != NULL);
assert(VP8SSE4x4 != NULL);
assert(VP8EncQuantizeBlock != NULL);
assert(VP8EncQuantize2Blocks != NULL);
assert(VP8FTransform2 != NULL);
assert(VP8EncPredLuma4 != NULL);
assert(VP8EncPredLuma16 != NULL);
assert(VP8EncPredChroma8 != NULL);
assert(VP8Mean16x4 != NULL);
assert(VP8EncQuantizeBlockWHT != NULL);
assert(VP8Copy4x4 != NULL);
assert(VP8Copy16x8 != NULL);
}

View File

@ -0,0 +1,677 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of speed-critical encoding functions.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
// Slobodan Prijic (slobodan.prijic@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "dsp_mips_macro.h"
#include "enc_vp8i_enc.h"
#include "enc_cost_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
// macro for one vertical pass in ITransformOne
// MUL macro inlined
// temp0..temp15 holds tmp[0]..tmp[15]
// A..D - offsets in bytes to load from in buffer
// TEMP0..TEMP3 - registers for corresponding tmp elements
// TEMP4..TEMP5 - temporary registers
#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3) \
"lh %[temp16], " #A "(%[temp20]) \n\t" \
"lh %[temp18], " #B "(%[temp20]) \n\t" \
"lh %[temp17], " #C "(%[temp20]) \n\t" \
"lh %[temp19], " #D "(%[temp20]) \n\t" \
"addu %[" #TEMP4 "], %[temp16], %[temp18] \n\t" \
"subu %[temp16], %[temp16], %[temp18] \n\t" \
"mul %[" #TEMP0 "], %[temp17], %[kC2] \n\t" \
"mul %[temp18], %[temp19], %[kC1] \n\t" \
"mul %[temp17], %[temp17], %[kC1] \n\t" \
"mul %[temp19], %[temp19], %[kC2] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\n" \
"sra %[temp18], %[temp18], 16 \n\n" \
"sra %[temp17], %[temp17], 16 \n\n" \
"sra %[temp19], %[temp19], 16 \n\n" \
"subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp18] \n\t" \
"addu %[" #TEMP3 "], %[temp17], %[temp19] \n\t" \
"addu %[" #TEMP0 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t" \
"addu %[" #TEMP1 "], %[temp16], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP2 "], %[temp16], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP3 "], %[" #TEMP4 "], %[" #TEMP3 "] \n\t"
// macro for one horizontal pass in ITransformOne
// MUL and STORE macros inlined
// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
// temp0..temp15 holds tmp[0]..tmp[15]
// A - offset in bytes to load from ref and store to dst buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
"addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
"addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
"mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
"mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
"mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
"subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
"addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
"addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
"subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
"subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
"lw %[temp20], 0(%[args]) \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
"lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
"addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
"addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
"addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
"slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
"slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
"slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
"slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
"movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
"movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
"movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
"movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
"addiu %[temp20], $zero, 255 \n\t" \
"slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
"slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
"slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
"slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
"movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
"movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
"lw %[temp16], 8(%[args]) \n\t" \
"movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
"movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
"sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
"sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
// Does one or two inverse transforms.
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
const int16_t* in,
uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};
__asm__ volatile(
"lw %[temp20], 4(%[args]) \n\t"
VERTICAL_PASS(0, 16, 8, 24, temp4, temp0, temp1, temp2, temp3)
VERTICAL_PASS(2, 18, 10, 26, temp8, temp4, temp5, temp6, temp7)
VERTICAL_PASS(4, 20, 12, 28, temp12, temp8, temp9, temp10, temp11)
VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
HORIZONTAL_PASS(0, temp0, temp4, temp8, temp12)
HORIZONTAL_PASS(1, temp1, temp5, temp9, temp13)
HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
: [args]"r"(args), [kC1]"r"(kC1), [kC2]"r"(kC2)
: "memory", "hi", "lo"
);
}
static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
uint8_t* dst, int do_two) {
ITransformOne_MIPS32(ref, in, dst);
if (do_two) {
ITransformOne_MIPS32(ref + 4, in + 16, dst + 4);
}
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
// macro for one pass through for loop in QuantizeBlock
// QUANTDIV macro inlined
// J - offset in bytes (kZigzag[n] * 2)
// K - offset in bytes (kZigzag[n] * 4)
// N - offset in bytes (n * 2)
#define QUANTIZE_ONE(J, K, N) \
"lh %[temp0], " #J "(%[ppin]) \n\t" \
"lhu %[temp1], " #J "(%[ppsharpen]) \n\t" \
"lw %[temp2], " #K "(%[ppzthresh]) \n\t" \
"sra %[sign], %[temp0], 15 \n\t" \
"xor %[coeff], %[temp0], %[sign] \n\t" \
"subu %[coeff], %[coeff], %[sign] \n\t" \
"addu %[coeff], %[coeff], %[temp1] \n\t" \
"slt %[temp4], %[temp2], %[coeff] \n\t" \
"addiu %[temp5], $zero, 0 \n\t" \
"addiu %[level], $zero, 0 \n\t" \
"beqz %[temp4], 2f \n\t" \
"lhu %[temp1], " #J "(%[ppiq]) \n\t" \
"lw %[temp2], " #K "(%[ppbias]) \n\t" \
"lhu %[temp3], " #J "(%[ppq]) \n\t" \
"mul %[level], %[coeff], %[temp1] \n\t" \
"addu %[level], %[level], %[temp2] \n\t" \
"sra %[level], %[level], 17 \n\t" \
"slt %[temp4], %[max_level], %[level] \n\t" \
"movn %[level], %[max_level], %[temp4] \n\t" \
"xor %[level], %[level], %[sign] \n\t" \
"subu %[level], %[level], %[sign] \n\t" \
"mul %[temp5], %[level], %[temp3] \n\t" \
"2: \n\t" \
"sh %[temp5], " #J "(%[ppin]) \n\t" \
"sh %[level], " #N "(%[pout]) \n\t"
static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int temp0, temp1, temp2, temp3, temp4, temp5;
int sign, coeff, level, i;
int max_level = MAX_LEVEL;
int16_t* ppin = &in[0];
int16_t* pout = &out[0];
const uint16_t* ppsharpen = &mtx->sharpen_[0];
const uint32_t* ppzthresh = &mtx->zthresh_[0];
const uint16_t* ppq = &mtx->q_[0];
const uint16_t* ppiq = &mtx->iq_[0];
const uint32_t* ppbias = &mtx->bias_[0];
__asm__ volatile(
QUANTIZE_ONE( 0, 0, 0)
QUANTIZE_ONE( 2, 4, 2)
QUANTIZE_ONE( 8, 16, 4)
QUANTIZE_ONE(16, 32, 6)
QUANTIZE_ONE(10, 20, 8)
QUANTIZE_ONE( 4, 8, 10)
QUANTIZE_ONE( 6, 12, 12)
QUANTIZE_ONE(12, 24, 14)
QUANTIZE_ONE(18, 36, 16)
QUANTIZE_ONE(24, 48, 18)
QUANTIZE_ONE(26, 52, 20)
QUANTIZE_ONE(20, 40, 22)
QUANTIZE_ONE(14, 28, 24)
QUANTIZE_ONE(22, 44, 26)
QUANTIZE_ONE(28, 56, 28)
QUANTIZE_ONE(30, 60, 30)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[sign]"=&r"(sign), [coeff]"=&r"(coeff),
[level]"=&r"(level)
: [pout]"r"(pout), [ppin]"r"(ppin),
[ppiq]"r"(ppiq), [max_level]"r"(max_level),
[ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
[ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
: "memory", "hi", "lo"
);
// moved out from macro to increase possibility for earlier breaking
for (i = 15; i >= 0; i--) {
if (out[i]) return 1;
}
return 0;
}
static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#undef QUANTIZE_ONE
// macro for one horizontal pass in Disto4x4 (TTransform)
// two calls of function TTransform are merged into single one
// A - offset in bytes to load from a and b buffers
// E..H - offsets in bytes to store first results to tmp buffer
// E1..H1 - offsets in bytes to store second results to tmp buffer
#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
"lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
"lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
"addu %[temp8], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"addu %[temp2], %[temp1], %[temp3] \n\t" \
"subu %[temp1], %[temp1], %[temp3] \n\t" \
"addu %[temp3], %[temp4], %[temp6] \n\t" \
"subu %[temp4], %[temp4], %[temp6] \n\t" \
"addu %[temp6], %[temp5], %[temp7] \n\t" \
"subu %[temp5], %[temp5], %[temp7] \n\t" \
"addu %[temp7], %[temp8], %[temp2] \n\t" \
"subu %[temp2], %[temp8], %[temp2] \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp3], %[temp6] \n\t" \
"subu %[temp3], %[temp3], %[temp6] \n\t" \
"addu %[temp6], %[temp4], %[temp5] \n\t" \
"subu %[temp4], %[temp4], %[temp5] \n\t" \
"sw %[temp7], " #E "(%[tmp]) \n\t" \
"sw %[temp2], " #H "(%[tmp]) \n\t" \
"sw %[temp8], " #F "(%[tmp]) \n\t" \
"sw %[temp0], " #G "(%[tmp]) \n\t" \
"sw %[temp1], " #E1 "(%[tmp]) \n\t" \
"sw %[temp3], " #H1 "(%[tmp]) \n\t" \
"sw %[temp6], " #F1 "(%[tmp]) \n\t" \
"sw %[temp4], " #G1 "(%[tmp]) \n\t"
// macro for one vertical pass in Disto4x4 (TTransform)
// two calls of function TTransform are merged into single one
// since only one accu is available in mips32r1 instruction set
// first is done second call of function TTransform and after
// that first one.
// const int sum1 = TTransform(a, w);
// const int sum2 = TTransform(b, w);
// return abs(sum2 - sum1) >> 5;
// (sum2 - sum1) is calculated with madds (sub2) and msubs (sub1)
// A..D - offsets in bytes to load first results from tmp buffer
// A1..D1 - offsets in bytes to load second results from tmp buffer
// E..H - offsets in bytes to load from w buffer
#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H) \
"lw %[temp0], " #A1 "(%[tmp]) \n\t" \
"lw %[temp1], " #C1 "(%[tmp]) \n\t" \
"lw %[temp2], " #B1 "(%[tmp]) \n\t" \
"lw %[temp3], " #D1 "(%[tmp]) \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp2], %[temp3] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"addu %[temp3], %[temp8], %[temp1] \n\t" \
"subu %[temp8], %[temp8], %[temp1] \n\t" \
"addu %[temp1], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"sra %[temp4], %[temp3], 31 \n\t" \
"sra %[temp5], %[temp1], 31 \n\t" \
"sra %[temp6], %[temp0], 31 \n\t" \
"sra %[temp7], %[temp8], 31 \n\t" \
"xor %[temp3], %[temp3], %[temp4] \n\t" \
"xor %[temp1], %[temp1], %[temp5] \n\t" \
"xor %[temp0], %[temp0], %[temp6] \n\t" \
"xor %[temp8], %[temp8], %[temp7] \n\t" \
"subu %[temp3], %[temp3], %[temp4] \n\t" \
"subu %[temp1], %[temp1], %[temp5] \n\t" \
"subu %[temp0], %[temp0], %[temp6] \n\t" \
"subu %[temp8], %[temp8], %[temp7] \n\t" \
"lhu %[temp4], " #E "(%[w]) \n\t" \
"lhu %[temp5], " #F "(%[w]) \n\t" \
"lhu %[temp6], " #G "(%[w]) \n\t" \
"lhu %[temp7], " #H "(%[w]) \n\t" \
"madd %[temp4], %[temp3] \n\t" \
"madd %[temp5], %[temp1] \n\t" \
"madd %[temp6], %[temp0] \n\t" \
"madd %[temp7], %[temp8] \n\t" \
"lw %[temp0], " #A "(%[tmp]) \n\t" \
"lw %[temp1], " #C "(%[tmp]) \n\t" \
"lw %[temp2], " #B "(%[tmp]) \n\t" \
"lw %[temp3], " #D "(%[tmp]) \n\t" \
"addu %[temp8], %[temp0], %[temp1] \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"addu %[temp1], %[temp2], %[temp3] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"addu %[temp3], %[temp8], %[temp1] \n\t" \
"subu %[temp1], %[temp8], %[temp1] \n\t" \
"addu %[temp8], %[temp0], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp2] \n\t" \
"sra %[temp2], %[temp3], 31 \n\t" \
"xor %[temp3], %[temp3], %[temp2] \n\t" \
"subu %[temp3], %[temp3], %[temp2] \n\t" \
"msub %[temp4], %[temp3] \n\t" \
"sra %[temp2], %[temp8], 31 \n\t" \
"sra %[temp3], %[temp0], 31 \n\t" \
"sra %[temp4], %[temp1], 31 \n\t" \
"xor %[temp8], %[temp8], %[temp2] \n\t" \
"xor %[temp0], %[temp0], %[temp3] \n\t" \
"xor %[temp1], %[temp1], %[temp4] \n\t" \
"subu %[temp8], %[temp8], %[temp2] \n\t" \
"subu %[temp0], %[temp0], %[temp3] \n\t" \
"subu %[temp1], %[temp1], %[temp4] \n\t" \
"msub %[temp5], %[temp8] \n\t" \
"msub %[temp6], %[temp0] \n\t" \
"msub %[temp7], %[temp1] \n\t"
static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int tmp[32];
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
__asm__ volatile(
HORIZONTAL_PASS(0, 0, 4, 8, 12, 64, 68, 72, 76)
HORIZONTAL_PASS(1, 16, 20, 24, 28, 80, 84, 88, 92)
HORIZONTAL_PASS(2, 32, 36, 40, 44, 96, 100, 104, 108)
HORIZONTAL_PASS(3, 48, 52, 56, 60, 112, 116, 120, 124)
"mthi $zero \n\t"
"mtlo $zero \n\t"
VERTICAL_PASS( 0, 16, 32, 48, 64, 80, 96, 112, 0, 8, 16, 24)
VERTICAL_PASS( 4, 20, 36, 52, 68, 84, 100, 116, 2, 10, 18, 26)
VERTICAL_PASS( 8, 24, 40, 56, 72, 88, 104, 120, 4, 12, 20, 28)
VERTICAL_PASS(12, 28, 44, 60, 76, 92, 108, 124, 6, 14, 22, 30)
"mflo %[temp0] \n\t"
"sra %[temp1], %[temp0], 31 \n\t"
"xor %[temp0], %[temp0], %[temp1] \n\t"
"subu %[temp0], %[temp0], %[temp1] \n\t"
"sra %[temp0], %[temp0], 5 \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
: [a]"r"(a), [b]"r"(b), [w]"r"(w), [tmp]"r"(tmp)
: "memory", "hi", "lo"
);
return temp0;
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_MIPS32(a + x + y, b + x + y, w);
}
}
return D;
}
// macro for one horizontal pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A - offset in bytes to load from src and ref buffers
// TEMP0..TEMP3 - registers for corresponding tmp elements
#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
"lw %[" #TEMP1 "], 0(%[args]) \n\t" \
"lw %[" #TEMP2 "], 4(%[args]) \n\t" \
"lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[temp20], %[temp16], %[temp17] \n\t" \
"lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
"lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
"lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
"subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
"subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
"addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
"subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
"addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
"subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
"mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
"mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
"mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
"mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
"addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
"subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
"sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
"sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
"addiu %[temp16], %[temp16], 1812 \n\t" \
"addiu %[temp17], %[temp17], 937 \n\t" \
"addu %[temp16], %[temp16], %[temp19] \n\t" \
"subu %[temp17], %[temp17], %[temp18] \n\t" \
"sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
"sra %[" #TEMP3 "], %[temp17], 9 \n\t"
// macro for one vertical pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
// A..D - offsets in bytes to store to out buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
"addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
"subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
"addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
"subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
"mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
"mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
"mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
"mul %[temp18], %[temp18], %[c5352] \n\t" \
"addiu %[temp16], %[temp16], 7 \n\t" \
"addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
"sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
"addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
"subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
"sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
"addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
"addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
"addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
"subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
"sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
"sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
"addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
"movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
"sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
"sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
"sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
"sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
int temp17, temp18, temp19, temp20;
const int c2217 = 2217;
const int c5352 = 5352;
const int* const args[3] =
{ (const int*)src, (const int*)ref, (const int*)out };
__asm__ volatile(
HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
"lw %[temp20], 8(%[args]) \n\t"
VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
[temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11),
[temp12]"=&r"(temp12), [temp13]"=&r"(temp13), [temp14]"=&r"(temp14),
[temp15]"=&r"(temp15), [temp16]"=&r"(temp16), [temp17]"=&r"(temp17),
[temp18]"=&r"(temp18), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
: [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
: "memory", "hi", "lo"
);
}
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
#if !defined(WORK_AROUND_GCC)
#define GET_SSE_INNER(A, B, C, D) \
"lbu %[temp0], " #A "(%[a]) \n\t" \
"lbu %[temp1], " #A "(%[b]) \n\t" \
"lbu %[temp2], " #B "(%[a]) \n\t" \
"lbu %[temp3], " #B "(%[b]) \n\t" \
"lbu %[temp4], " #C "(%[a]) \n\t" \
"lbu %[temp5], " #C "(%[b]) \n\t" \
"lbu %[temp6], " #D "(%[a]) \n\t" \
"lbu %[temp7], " #D "(%[b]) \n\t" \
"subu %[temp0], %[temp0], %[temp1] \n\t" \
"subu %[temp2], %[temp2], %[temp3] \n\t" \
"subu %[temp4], %[temp4], %[temp5] \n\t" \
"subu %[temp6], %[temp6], %[temp7] \n\t" \
"madd %[temp0], %[temp0] \n\t" \
"madd %[temp2], %[temp2] \n\t" \
"madd %[temp4], %[temp4] \n\t" \
"madd %[temp6], %[temp6] \n\t"
#define GET_SSE(A, B, C, D) \
GET_SSE_INNER(A, A + 1, A + 2, A + 3) \
GET_SSE_INNER(B, B + 1, B + 2, B + 3) \
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
__asm__ volatile(
"mult $zero, $zero \n\t"
GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
: "memory", "hi", "lo"
);
return count;
}
#undef GET_SSE
#undef GET_SSE_INNER
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
VP8ITransform = ITransform_MIPS32;
VP8FTransform = FTransform_MIPS32;
VP8EncQuantizeBlock = QuantizeBlock_MIPS32;
VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32;
VP8TDisto4x4 = Disto4x4_MIPS32;
VP8TDisto16x16 = Disto16x16_MIPS32;
#if !defined(WORK_AROUND_GCC)
VP8SSE16x16 = SSE16x16_MIPS32;
VP8SSE8x8 = SSE8x8_MIPS32;
VP8SSE16x8 = SSE16x8_MIPS32;
VP8SSE4x4 = SSE4x4_MIPS32;
#endif
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
#endif // WEBP_USE_MIPS32

File diff suppressed because it is too large Load Diff

896
vendor/github.com/sizeofint/webpanimation/dsp_enc_msa.c generated vendored Normal file
View File

@ -0,0 +1,896 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of encoder dsp functions.
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MSA)
#include <stdlib.h>
#include "dsp_msa_macro.h"
#include "enc_vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms
#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \
v4i32 a1_m, b1_m, c1_m, d1_m; \
const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \
v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \
v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \
v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \
\
ADDSUB2(in0, in2, a1_m, b1_m); \
SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \
c_tmp2_m = c_tmp2_m + in3; \
c1_m = c_tmp1_m - c_tmp2_m; \
SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \
d_tmp1_m = d_tmp1_m + in1; \
d1_m = d_tmp1_m + d_tmp2_m; \
BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
} while (0)
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
uint8_t* dst) {
v8i16 input0, input1;
v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
v4i32 res0, res1, res2, res3;
v16i8 dest0, dest1, dest2, dest3;
const v16i8 zero = { 0 };
LD_SH2(in, 8, input0, input1);
UNPCK_SH_SW(input0, in0, in1);
UNPCK_SH_SW(input1, in2, in3);
IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
res0, res1, res2, res3);
ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
res0, res1, res2, res3);
ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
CLIP_SW4_0_255(res0, res1, res2, res3);
PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
}
static void ITransform_MSA(const uint8_t* ref, const int16_t* in, uint8_t* dst,
int do_two) {
ITransformOne(ref, in, dst);
if (do_two) {
ITransformOne(ref + 4, in + 16, dst + 4);
}
}
static void FTransform_MSA(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
uint64_t out0, out1, out2, out3;
uint32_t in0, in1, in2, in3;
v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
v8i16 t0, t1, t2, t3;
v16u8 srcl0, srcl1, src0 = { 0 }, src1 = { 0 };
const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
LW4(src, BPS, in0, in1, in2, in3);
INSERT_W4_UB(in0, in1, in2, in3, src0);
LW4(ref, BPS, in0, in1, in2, in3);
INSERT_W4_UB(in0, in1, in2, in3, src1);
ILVRL_B2_UB(src0, src1, srcl0, srcl1);
HSUB_UB2_SH(srcl0, srcl1, t0, t1);
VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
ADDSUB2(t2, t3, t0, t1);
t0 = SRLI_H(t0, 3);
VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
tmp0 = __msa_hadd_s_w(t3, t3);
tmp2 = __msa_hsub_s_w(t3, t3);
FILL_W2_SW(1812, 937, tmp1, tmp3);
DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
SRAI_W2_SW(tmp1, tmp3, 9);
PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
ADDSUB2(t2, t3, t0, t1);
VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
tmp0 = __msa_hadd_s_w(t3, t3);
tmp2 = __msa_hsub_s_w(t3, t3);
ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
SRAI_W2_SW(tmp0, tmp2, 4);
FILL_W2_SW(12000, 51000, tmp1, tmp3);
DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
SRAI_W2_SW(tmp1, tmp3, 16);
UNPCK_R_SH_SW(t1, tmp4);
tmp5 = __msa_ceqi_w(tmp4, 0);
tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
tmp5 = __msa_fill_w(1);
tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
tmp1 += tmp5;
PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
out0 = __msa_copy_s_d((v2i64)t0, 0);
out1 = __msa_copy_s_d((v2i64)t0, 1);
out2 = __msa_copy_s_d((v2i64)t1, 0);
out3 = __msa_copy_s_d((v2i64)t1, 1);
SD4(out0, out1, out2, out3, out, 8);
}
static void FTransformWHT_MSA(const int16_t* in, int16_t* out) {
v8i16 in0 = { 0 };
v8i16 in1 = { 0 };
v8i16 tmp0, tmp1, tmp2, tmp3;
v8i16 out0, out1;
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
in0 = __msa_insert_h(in0, 0, in[ 0]);
in0 = __msa_insert_h(in0, 1, in[ 64]);
in0 = __msa_insert_h(in0, 2, in[128]);
in0 = __msa_insert_h(in0, 3, in[192]);
in0 = __msa_insert_h(in0, 4, in[ 16]);
in0 = __msa_insert_h(in0, 5, in[ 80]);
in0 = __msa_insert_h(in0, 6, in[144]);
in0 = __msa_insert_h(in0, 7, in[208]);
in1 = __msa_insert_h(in1, 0, in[ 48]);
in1 = __msa_insert_h(in1, 1, in[112]);
in1 = __msa_insert_h(in1, 2, in[176]);
in1 = __msa_insert_h(in1, 3, in[240]);
in1 = __msa_insert_h(in1, 4, in[ 32]);
in1 = __msa_insert_h(in1, 5, in[ 96]);
in1 = __msa_insert_h(in1, 6, in[160]);
in1 = __msa_insert_h(in1, 7, in[224]);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, out0, out1);
SRAI_H2_SH(out0, out1, 1);
ST_SH2(out0, out1, out, 8);
}
static int TTransform_MSA(const uint8_t* in, const uint16_t* w) {
int sum;
uint32_t in0_m, in1_m, in2_m, in3_m;
v16i8 src0 = { 0 };
v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
v4i32 dst0, dst1;
const v16i8 zero = { 0 };
const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
ILVRL_B2_SH(zero, src0, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
ADDSUB2(in0, in1, tmp0, tmp1);
VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
ADDSUB2(tmp2, tmp3, tmp0, tmp1);
tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
LD_SH2(w, 8, tmp2, tmp3);
DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
dst0 = dst0 + dst1;
sum = HADD_SW_S32(dst0);
return sum;
}
static int Disto4x4_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int sum1 = TTransform_MSA(a, w);
const int sum2 = TTransform_MSA(b, w);
return abs(sum2 - sum1) >> 5;
}
static int Disto16x16_MSA(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_MSA(a + x + y, b + x + y, w);
}
}
return D;
}
//------------------------------------------------------------------------------
// Histogram
static void CollectHistogram_MSA(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
{
int k;
v8i16 coeff0, coeff1;
const v8i16 zero = { 0 };
const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
LD_SH2(&out[0], 8, coeff0, coeff1);
coeff0 = __msa_add_a_h(coeff0, zero);
coeff1 = __msa_add_a_h(coeff1, zero);
SRAI_H2_SH(coeff0, coeff1, 3);
coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
ST_SH2(coeff0, coeff1, &out[0], 8);
for (k = 0; k < 16; ++k) {
++distribution[out[k]];
}
}
}
VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
// Intra predictions
// luma 4x4 prediction
#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top - 1);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
const v16u8 AC = __msa_ave_u_b(A, C);
const v16u8 B2 = __msa_ave_u_b(B, B);
const v16u8 R = __msa_aver_u_b(AC, B2);
const uint32_t out = __msa_copy_s_w((v4i32)R, 0);
SW4(out, out, out, out, dst, BPS);
}
static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
dc >>= 3;
dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
SW4(dc, dc, dc, dc, dst, BPS);
}
static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
const v16u8 A2 = { 0 };
const uint64_t val_m = LD(top - 5);
const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A2, 0, val_m);
const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C = SLDI_UB(A, A, 2);
const v16u8 AC = __msa_ave_u_b(A, C);
const v16u8 B2 = __msa_ave_u_b(B, B);
const v16u8 R0 = __msa_aver_u_b(AC, B2);
const v16u8 R1 = SLDI_UB(R0, R0, 1);
const v16u8 R2 = SLDI_UB(R1, R1, 1);
const v16u8 R3 = SLDI_UB(R2, R2, 1);
const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
SW4(val3, val2, val1, val0, dst, BPS);
}
static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
const v16u8 A1 = { 0 };
const uint64_t val_m = LD(top);
const v16u8 A = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
const v16u8 B = SLDI_UB(A, A, 1);
const v16u8 C1 = SLDI_UB(A, A, 2);
const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
const v16u8 AC = __msa_ave_u_b(A, C);
const v16u8 B2 = __msa_ave_u_b(B, B);
const v16u8 R0 = __msa_aver_u_b(AC, B2);
const v16u8 R1 = SLDI_UB(R0, R0, 1);
const v16u8 R2 = SLDI_UB(R1, R1, 1);
const v16u8 R3 = SLDI_UB(R2, R2, 1);
const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
SW4(val0, val1, val2, val3, dst, BPS);
}
static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
DST(0, 0) = DST(1, 2) = AVG2(X, A);
DST(1, 0) = DST(2, 2) = AVG2(A, B);
DST(2, 0) = DST(3, 2) = AVG2(B, C);
DST(3, 0) = AVG2(C, D);
DST(0, 3) = AVG3(K, J, I);
DST(0, 2) = AVG3(J, I, X);
DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
DST(3, 1) = AVG3(B, C, D);
}
static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
const int A = top[0];
const int B = top[1];
const int C = top[2];
const int D = top[3];
const int E = top[4];
const int F = top[5];
const int G = top[6];
const int H = top[7];
DST(0, 0) = AVG2(A, B);
DST(1, 0) = DST(0, 2) = AVG2(B, C);
DST(2, 0) = DST(1, 2) = AVG2(C, D);
DST(3, 0) = DST(2, 2) = AVG2(D, E);
DST(0, 1) = AVG3(A, B, C);
DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
DST(3, 2) = AVG3(E, F, G);
DST(3, 3) = AVG3(F, G, H);
}
static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
DST(0, 0) = AVG2(I, J);
DST(2, 0) = DST(0, 1) = AVG2(J, K);
DST(2, 1) = DST(0, 2) = AVG2(K, L);
DST(1, 0) = AVG3(I, J, K);
DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
DST(3, 2) = DST(2, 2) =
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
const int X = top[-1];
const int I = top[-2];
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
const int A = top[0];
const int B = top[1];
const int C = top[2];
DST(0, 0) = DST(2, 1) = AVG2(I, X);
DST(0, 1) = DST(2, 2) = AVG2(J, I);
DST(0, 2) = DST(2, 3) = AVG2(K, J);
DST(0, 3) = AVG2(L, K);
DST(3, 0) = AVG3(A, B, C);
DST(2, 0) = AVG3(X, A, B);
DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
DST(1, 3) = AVG3(L, K, J);
}
static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
const v16i8 zero = { 0 };
const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
const v16u8 T1 = LD_UB(top);
const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
const v8i16 d = T - TL;
v8i16 r0, r1, r2, r3;
ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
CLIP_SH4_0_255(r0, r1, r2, r3);
PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
}
#undef DST
#undef AVG3
#undef AVG2
static void Intra4Preds_MSA(uint8_t* dst, const uint8_t* top) {
DC4(I4DC4 + dst, top);
TM4(I4TM4 + dst, top);
VE4(I4VE4 + dst, top);
HE4(I4HE4 + dst, top);
RD4(I4RD4 + dst, top);
VR4(I4VR4 + dst, top);
LD4(I4LD4 + dst, top);
VL4(I4VL4 + dst, top);
HD4(I4HD4 + dst, top);
HU4(I4HU4 + dst, top);
}
// luma 16x16 prediction
#define STORE16x16(out, dst) do { \
ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS); \
ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \
} while (0)
static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
if (top != NULL) {
const v16u8 out = LD_UB(top);
STORE16x16(out, dst);
} else {
const v16u8 out = (v16u8)__msa_fill_b(0x7f);
STORE16x16(out, dst);
}
}
static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
const uint8_t* left) {
if (left != NULL) {
int j;
for (j = 0; j < 16; j += 4) {
const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
ST_UB4(L0, L1, L2, L3, dst, BPS);
dst += 4 * BPS;
left += 4;
}
} else {
const v16u8 out = (v16u8)__msa_fill_b(0x81);
STORE16x16(out, dst);
}
}
static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
if (left != NULL) {
if (top != NULL) {
int j;
v8i16 d1, d2;
const v16i8 zero = { 0 };
const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
const v16u8 T = LD_UB(top);
ILVRL_B2_SH(zero, T, d1, d2);
SUB2(d1, TL, d2, TL, d1, d2);
for (j = 0; j < 16; j += 4) {
v16i8 t0, t1, t2, t3;
v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
CLIP_SH4_0_255(r0, r1, r2, r3);
CLIP_SH4_0_255(r4, r5, r6, r7);
PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
ST_SB4(t0, t1, t2, t3, dst, BPS);
dst += 4 * BPS;
}
} else {
HorizontalPred16x16(dst, left);
}
} else {
if (top != NULL) {
VerticalPred16x16(dst, top);
} else {
const v16u8 out = (v16u8)__msa_fill_b(0x81);
STORE16x16(out, dst);
}
}
}
static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
int DC;
v16u8 out;
if (top != NULL && left != NULL) {
const v16u8 rtop = LD_UB(top);
const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
const v16u8 rleft = LD_UB(left);
const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
const v8u16 dctemp = dctop + dcleft;
DC = HADD_UH_U32(dctemp);
DC = (DC + 16) >> 5;
} else if (left != NULL) { // left but no top
const v16u8 rleft = LD_UB(left);
const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
DC = HADD_UH_U32(dcleft);
DC = (DC + DC + 16) >> 5;
} else if (top != NULL) { // top but no left
const v16u8 rtop = LD_UB(top);
const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
DC = HADD_UH_U32(dctop);
DC = (DC + DC + 16) >> 5;
} else { // no top, no left, nothing.
DC = 0x80;
}
out = (v16u8)__msa_fill_b(DC);
STORE16x16(out, dst);
}
static void Intra16Preds_MSA(uint8_t* dst,
const uint8_t* left, const uint8_t* top) {
DCMode16x16(I16DC16 + dst, left, top);
VerticalPred16x16(I16VE16 + dst, top);
HorizontalPred16x16(I16HE16 + dst, left);
TrueMotion16x16(I16TM16 + dst, left, top);
}
// Chroma 8x8 prediction
#define CALC_DC8(in, out) do { \
const v8u16 temp0 = __msa_hadd_u_h(in, in); \
const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0); \
const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1); \
const v2i64 temp3 = __msa_splati_d(temp2, 1); \
const v2i64 temp4 = temp3 + temp2; \
const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4); \
const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0); \
out = __msa_copy_s_d(temp6, 0); \
} while (0)
#define STORE8x8(out, dst) do { \
SD4(out, out, out, out, dst + 0 * BPS, BPS); \
SD4(out, out, out, out, dst + 4 * BPS, BPS); \
} while (0)
static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
if (top != NULL) {
const uint64_t out = LD(top);
STORE8x8(out, dst);
} else {
const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;
STORE8x8(out, dst);
}
}
static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
if (left != NULL) {
int j;
for (j = 0; j < 8; j += 4) {
const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
SD4(out0, out1, out2, out3, dst, BPS);
dst += 4 * BPS;
left += 4;
}
} else {
const uint64_t out = 0x8181818181818181ULL;
STORE8x8(out, dst);
}
}
static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
if (left != NULL) {
if (top != NULL) {
int j;
const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
const v16u8 T1 = LD_UB(top);
const v16i8 zero = { 0 };
const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
const v8i16 d = T - TL;
for (j = 0; j < 8; j += 4) {
uint64_t out0, out1, out2, out3;
v16i8 t0, t1;
v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
CLIP_SH4_0_255(r0, r1, r2, r3);
PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
out0 = __msa_copy_s_d((v2i64)t0, 0);
out1 = __msa_copy_s_d((v2i64)t0, 1);
out2 = __msa_copy_s_d((v2i64)t1, 0);
out3 = __msa_copy_s_d((v2i64)t1, 1);
SD4(out0, out1, out2, out3, dst, BPS);
dst += 4 * BPS;
}
} else {
HorizontalPred8x8(dst, left);
}
} else {
if (top != NULL) {
VerticalPred8x8(dst, top);
} else {
const uint64_t out = 0x8181818181818181ULL;
STORE8x8(out, dst);
}
}
}
static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
uint64_t out;
v16u8 src = { 0 };
if (top != NULL && left != NULL) {
const uint64_t left_m = LD(left);
const uint64_t top_m = LD(top);
INSERT_D2_UB(left_m, top_m, src);
CALC_DC8(src, out);
} else if (left != NULL) { // left but no top
const uint64_t left_m = LD(left);
INSERT_D2_UB(left_m, left_m, src);
CALC_DC8(src, out);
} else if (top != NULL) { // top but no left
const uint64_t top_m = LD(top);
INSERT_D2_UB(top_m, top_m, src);
CALC_DC8(src, out);
} else { // no top, no left, nothing.
src = (v16u8)__msa_fill_b(0x80);
out = __msa_copy_s_d((v2i64)src, 0);
}
STORE8x8(out, dst);
}
static void IntraChromaPreds_MSA(uint8_t* dst, const uint8_t* left,
const uint8_t* top) {
// U block
DCMode8x8(C8DC8 + dst, left, top);
VerticalPred8x8(C8VE8 + dst, top);
HorizontalPred8x8(C8HE8 + dst, left);
TrueMotion8x8(C8TM8 + dst, left, top);
// V block
dst += 8;
if (top != NULL) top += 8;
if (left != NULL) left += 16;
DCMode8x8(C8DC8 + dst, left, top);
VerticalPred8x8(C8VE8 + dst, top);
HorizontalPred8x8(C8HE8 + dst, left);
TrueMotion8x8(C8TM8 + dst, left, top);
}
//------------------------------------------------------------------------------
// Metric
#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
v16u8 tmp0, tmp1; \
v8i16 tmp2, tmp3; \
ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
} while (0)
#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
v16u8 tmp0, tmp1; \
v8i16 tmp2, tmp3; \
ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
} while (0)
static int SSE16x16_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
v4i32 out0, out1, out2, out3;
LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
a += 8 * BPS;
b += 8 * BPS;
LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
out0 += out1;
out2 += out3;
out0 += out2;
sum = HADD_SW_S32(out0);
return sum;
}
static int SSE16x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
v4i32 out0, out1, out2, out3;
LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
out0 += out1;
out2 += out3;
out0 += out2;
sum = HADD_SW_S32(out0);
return sum;
}
static int SSE8x8_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
v16u8 t0, t1, t2, t3;
v4i32 out0, out1, out2, out3;
LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
out0 += out1;
out2 += out3;
out0 += out2;
sum = HADD_SW_S32(out0);
return sum;
}
static int SSE4x4_MSA(const uint8_t* a, const uint8_t* b) {
uint32_t sum = 0;
uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
v16u8 src = { 0 }, ref = { 0 }, tmp0, tmp1;
v8i16 diff0, diff1;
v4i32 out0, out1;
LW4(a, BPS, src0, src1, src2, src3);
LW4(b, BPS, ref0, ref1, ref2, ref3);
INSERT_W4_UB(src0, src1, src2, src3, src);
INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
ILVRL_B2_UB(src, ref, tmp0, tmp1);
HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
out0 += out1;
sum = HADD_SW_S32(out0);
return sum;
}
//------------------------------------------------------------------------------
// Quantization
static int QuantizeBlock_MSA(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int sum;
v8i16 in0, in1, sh0, sh1, out0, out1;
v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
const v8i16 zero = { 0 };
const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
LD_SH2(&in[0], 8, in0, in1);
LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);
tmp4 = __msa_add_a_h(in0, zero);
tmp5 = __msa_add_a_h(in1, zero);
ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
sign0 = (in0 < zero);
sign1 = (in1 < zero); // sign
LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1); // iq
ILVRL_H2_SW(zero, tmp0, t0, t1);
ILVRL_H2_SW(zero, tmp1, t2, t3);
LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3); // bias
MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
SRAI_W4_SW(b0, b1, b2, b3, 17);
PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
tmp0 = (tmp2 > maxlevel);
tmp1 = (tmp3 > maxlevel);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
SUB2(zero, tmp2, zero, tmp3, tmp0, tmp1);
tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
t0 = (s0 > t0);
t1 = (s1 > t1);
t2 = (s2 > t2);
t3 = (s3 > t3);
PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);
MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
ST_SH2(in0, in1, &in[0], 8);
ST_SH2(out0, out1, &out[0], 8);
out0 = __msa_add_a_h(out0, out1);
sum = HADD_SH_S32(out0);
return (sum > 0);
}
static int Quantize2Blocks_MSA(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
VP8ITransform = ITransform_MSA;
VP8FTransform = FTransform_MSA;
VP8FTransformWHT = FTransformWHT_MSA;
VP8TDisto4x4 = Disto4x4_MSA;
VP8TDisto16x16 = Disto16x16_MSA;
VP8CollectHistogram = CollectHistogram_MSA;
VP8EncPredLuma4 = Intra4Preds_MSA;
VP8EncPredLuma16 = Intra16Preds_MSA;
VP8EncPredChroma8 = IntraChromaPreds_MSA;
VP8SSE16x16 = SSE16x16_MSA;
VP8SSE16x8 = SSE16x8_MSA;
VP8SSE8x8 = SSE8x8_MSA;
VP8SSE4x4 = SSE4x4_MSA;
VP8EncQuantizeBlock = QuantizeBlock_MSA;
VP8EncQuantize2Blocks = Quantize2Blocks_MSA;
VP8EncQuantizeBlockWHT = QuantizeBlock_MSA;
}
#else // !WEBP_USE_MSA
WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
#endif // WEBP_USE_MSA

View File

@ -0,0 +1,938 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// ARM NEON version of speed-critical encoding functions.
//
// adapted from libvpx (http://www.webmproject.org/code/)
#include "dsp_dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include "dsp_neon.h"
#include "enc_vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Inverse transform.
// This code is pretty much the same as TransformOne in the dec_neon.c, except
// for subtraction to *ref. See the comments there for algorithmic explanations.
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
// WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster speed than inlined-assembly.
#if defined(WEBP_USE_INTRINSICS)
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16_NEON(uint32x2_t v) {
return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
// to the corresponding rows of 'dst'.
static WEBP_INLINE void SaturateAndStore4x4_NEON(uint8_t* const dst,
const int16x8_t dst01,
const int16x8_t dst23) {
// Unsigned saturate to 8b.
const uint8x8_t dst01_u8 = vqmovun_s16(dst01);
const uint8x8_t dst23_u8 = vqmovun_s16(dst23);
// Store the results.
vst1_lane_u32((uint32_t*)(dst + 0 * BPS), vreinterpret_u32_u8(dst01_u8), 0);
vst1_lane_u32((uint32_t*)(dst + 1 * BPS), vreinterpret_u32_u8(dst01_u8), 1);
vst1_lane_u32((uint32_t*)(dst + 2 * BPS), vreinterpret_u32_u8(dst23_u8), 0);
vst1_lane_u32((uint32_t*)(dst + 3 * BPS), vreinterpret_u32_u8(dst23_u8), 1);
}
static WEBP_INLINE void Add4x4_NEON(const int16x8_t row01,
const int16x8_t row23,
const uint8_t* const ref,
uint8_t* const dst) {
uint32x2_t dst01 = vdup_n_u32(0);
uint32x2_t dst23 = vdup_n_u32(0);
// Load the source pixels.
dst01 = vld1_lane_u32((uint32_t*)(ref + 0 * BPS), dst01, 0);
dst23 = vld1_lane_u32((uint32_t*)(ref + 2 * BPS), dst23, 0);
dst01 = vld1_lane_u32((uint32_t*)(ref + 1 * BPS), dst01, 1);
dst23 = vld1_lane_u32((uint32_t*)(ref + 3 * BPS), dst23, 1);
{
// Convert to 16b.
const int16x8_t dst01_s16 = ConvertU8ToS16_NEON(dst01);
const int16x8_t dst23_s16 = ConvertU8ToS16_NEON(dst23);
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
const int16x8_t out23 = vrsraq_n_s16(dst23_s16, row23, 3);
// Add the inverse transform.
SaturateAndStore4x4_NEON(dst, out01, out23);
}
}
static WEBP_INLINE void Transpose8x2_NEON(const int16x8_t in0,
const int16x8_t in1,
int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
// c0 c1 c2 c3 | d0 d1 d2 d3 a2 b2 c2 d2 | a3 b3 c3 d3
const int16x8x2_t tmp0 = vzipq_s16(in0, in1); // a0 c0 a1 c1 a2 c2 ...
// b0 d0 b1 d1 b2 d2 ...
*out = vzipq_s16(tmp0.val[0], tmp0.val[1]);
}
static WEBP_INLINE void TransformPass_NEON(int16x8x2_t* const rows) {
// {rows} = in0 | in4
// in8 | in12
// B1 = in4 | in12
const int16x8_t B1 =
vcombine_s16(vget_high_s16(rows->val[0]), vget_high_s16(rows->val[1]));
// C0 = kC1 * in4 | kC1 * in12
// C1 = kC2 * in4 | kC2 * in12
const int16x8_t C0 = vsraq_n_s16(B1, vqdmulhq_n_s16(B1, kC1), 1);
const int16x8_t C1 = vqdmulhq_n_s16(B1, kC2);
const int16x4_t a = vqadd_s16(vget_low_s16(rows->val[0]),
vget_low_s16(rows->val[1])); // in0 + in8
const int16x4_t b = vqsub_s16(vget_low_s16(rows->val[0]),
vget_low_s16(rows->val[1])); // in0 - in8
// c = kC2 * in4 - kC1 * in12
// d = kC1 * in4 + kC2 * in12
const int16x4_t c = vqsub_s16(vget_low_s16(C1), vget_high_s16(C0));
const int16x4_t d = vqadd_s16(vget_low_s16(C0), vget_high_s16(C1));
const int16x8_t D0 = vcombine_s16(a, b); // D0 = a | b
const int16x8_t D1 = vcombine_s16(d, c); // D1 = d | c
const int16x8_t E0 = vqaddq_s16(D0, D1); // a+d | b+c
const int16x8_t E_tmp = vqsubq_s16(D0, D1); // a-d | b-c
const int16x8_t E1 = vcombine_s16(vget_high_s16(E_tmp), vget_low_s16(E_tmp));
Transpose8x2_NEON(E0, E1, rows);
}
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
int16x8x2_t rows;
INIT_VECTOR2(rows, vld1q_s16(in + 0), vld1q_s16(in + 8));
TransformPass_NEON(&rows);
TransformPass_NEON(&rows);
Add4x4_NEON(rows.val[0], rows.val[1], ref, dst);
}
#else
static void ITransformOne_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst) {
const int kBPS = BPS;
const int16_t kC1C2[] = { kC1, kC2, 0, 0 };
__asm__ volatile (
"vld1.16 {q1, q2}, [%[in]] \n"
"vld1.16 {d0}, [%[kC1C2]] \n"
// d2: in[0]
// d3: in[8]
// d4: in[4]
// d5: in[12]
"vswp d3, d4 \n"
// q8 = {in[4], in[12]} * kC1 * 2 >> 16
// q9 = {in[4], in[12]} * kC2 >> 16
"vqdmulh.s16 q8, q2, d0[0] \n"
"vqdmulh.s16 q9, q2, d0[1] \n"
// d22 = a = in[0] + in[8]
// d23 = b = in[0] - in[8]
"vqadd.s16 d22, d2, d3 \n"
"vqsub.s16 d23, d2, d3 \n"
// q8 = in[4]/[12] * kC1 >> 16
"vshr.s16 q8, q8, #1 \n"
// Add {in[4], in[12]} back after the multiplication.
"vqadd.s16 q8, q2, q8 \n"
// d20 = c = in[4]*kC2 - in[12]*kC1
// d21 = d = in[4]*kC1 + in[12]*kC2
"vqsub.s16 d20, d18, d17 \n"
"vqadd.s16 d21, d19, d16 \n"
// d2 = tmp[0] = a + d
// d3 = tmp[1] = b + c
// d4 = tmp[2] = b - c
// d5 = tmp[3] = a - d
"vqadd.s16 d2, d22, d21 \n"
"vqadd.s16 d3, d23, d20 \n"
"vqsub.s16 d4, d23, d20 \n"
"vqsub.s16 d5, d22, d21 \n"
"vzip.16 q1, q2 \n"
"vzip.16 q1, q2 \n"
"vswp d3, d4 \n"
// q8 = {tmp[4], tmp[12]} * kC1 * 2 >> 16
// q9 = {tmp[4], tmp[12]} * kC2 >> 16
"vqdmulh.s16 q8, q2, d0[0] \n"
"vqdmulh.s16 q9, q2, d0[1] \n"
// d22 = a = tmp[0] + tmp[8]
// d23 = b = tmp[0] - tmp[8]
"vqadd.s16 d22, d2, d3 \n"
"vqsub.s16 d23, d2, d3 \n"
"vshr.s16 q8, q8, #1 \n"
"vqadd.s16 q8, q2, q8 \n"
// d20 = c = in[4]*kC2 - in[12]*kC1
// d21 = d = in[4]*kC1 + in[12]*kC2
"vqsub.s16 d20, d18, d17 \n"
"vqadd.s16 d21, d19, d16 \n"
// d2 = tmp[0] = a + d
// d3 = tmp[1] = b + c
// d4 = tmp[2] = b - c
// d5 = tmp[3] = a - d
"vqadd.s16 d2, d22, d21 \n"
"vqadd.s16 d3, d23, d20 \n"
"vqsub.s16 d4, d23, d20 \n"
"vqsub.s16 d5, d22, d21 \n"
"vld1.32 d6[0], [%[ref]], %[kBPS] \n"
"vld1.32 d6[1], [%[ref]], %[kBPS] \n"
"vld1.32 d7[0], [%[ref]], %[kBPS] \n"
"vld1.32 d7[1], [%[ref]], %[kBPS] \n"
"sub %[ref], %[ref], %[kBPS], lsl #2 \n"
// (val) + 4 >> 3
"vrshr.s16 d2, d2, #3 \n"
"vrshr.s16 d3, d3, #3 \n"
"vrshr.s16 d4, d4, #3 \n"
"vrshr.s16 d5, d5, #3 \n"
"vzip.16 q1, q2 \n"
"vzip.16 q1, q2 \n"
// Must accumulate before saturating
"vmovl.u8 q8, d6 \n"
"vmovl.u8 q9, d7 \n"
"vqadd.s16 q1, q1, q8 \n"
"vqadd.s16 q2, q2, q9 \n"
"vqmovun.s16 d0, q1 \n"
"vqmovun.s16 d1, q2 \n"
"vst1.32 d0[0], [%[dst]], %[kBPS] \n"
"vst1.32 d0[1], [%[dst]], %[kBPS] \n"
"vst1.32 d1[0], [%[dst]], %[kBPS] \n"
"vst1.32 d1[1], [%[dst]] \n"
: [in] "+r"(in), [dst] "+r"(dst) // modified registers
: [kBPS] "r"(kBPS), [kC1C2] "r"(kC1C2), [ref] "r"(ref) // constants
: "memory", "q0", "q1", "q2", "q8", "q9", "q10", "q11" // clobbered
);
}
#endif // WEBP_USE_INTRINSICS
static void ITransform_NEON(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
ITransformOne_NEON(ref, in, dst);
if (do_two) {
ITransformOne_NEON(ref + 4, in + 16, dst + 4);
}
}
// Load all 4x4 pixels into a single uint8x16_t variable.
static uint8x16_t Load4x4_NEON(const uint8_t* src) {
uint32x4_t out = vdupq_n_u32(0);
out = vld1q_lane_u32((const uint32_t*)(src + 0 * BPS), out, 0);
out = vld1q_lane_u32((const uint32_t*)(src + 1 * BPS), out, 1);
out = vld1q_lane_u32((const uint32_t*)(src + 2 * BPS), out, 2);
out = vld1q_lane_u32((const uint32_t*)(src + 3 * BPS), out, 3);
return vreinterpretq_u8_u32(out);
}
// Forward transform.
#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose4x4_S16_NEON(const int16x4_t A,
const int16x4_t B,
const int16x4_t C,
const int16x4_t D,
int16x8_t* const out01,
int16x8_t* const out32) {
const int16x4x2_t AB = vtrn_s16(A, B);
const int16x4x2_t CD = vtrn_s16(C, D);
const int32x2x2_t tmp02 = vtrn_s32(vreinterpret_s32_s16(AB.val[0]),
vreinterpret_s32_s16(CD.val[0]));
const int32x2x2_t tmp13 = vtrn_s32(vreinterpret_s32_s16(AB.val[1]),
vreinterpret_s32_s16(CD.val[1]));
*out01 = vreinterpretq_s16_s64(
vcombine_s64(vreinterpret_s64_s32(tmp02.val[0]),
vreinterpret_s64_s32(tmp13.val[0])));
*out32 = vreinterpretq_s16_s64(
vcombine_s64(vreinterpret_s64_s32(tmp13.val[1]),
vreinterpret_s64_s32(tmp02.val[1])));
}
static WEBP_INLINE int16x8_t DiffU8ToS16_NEON(const uint8x8_t a,
const uint8x8_t b) {
return vreinterpretq_s16_u16(vsubl_u8(a, b));
}
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
int16x8_t d0d1, d3d2; // working 4x4 int16 variables
{
const uint8x16_t S0 = Load4x4_NEON(src);
const uint8x16_t R0 = Load4x4_NEON(ref);
const int16x8_t D0D1 = DiffU8ToS16_NEON(vget_low_u8(S0), vget_low_u8(R0));
const int16x8_t D2D3 = DiffU8ToS16_NEON(vget_high_u8(S0), vget_high_u8(R0));
const int16x4_t D0 = vget_low_s16(D0D1);
const int16x4_t D1 = vget_high_s16(D0D1);
const int16x4_t D2 = vget_low_s16(D2D3);
const int16x4_t D3 = vget_high_s16(D2D3);
Transpose4x4_S16_NEON(D0, D1, D2, D3, &d0d1, &d3d2);
}
{ // 1rst pass
const int32x4_t kCst937 = vdupq_n_s32(937);
const int32x4_t kCst1812 = vdupq_n_s32(1812);
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
const int16x8_t a0a1_2 = vshlq_n_s16(a0a1, 3);
const int16x4_t tmp0 = vadd_s16(vget_low_s16(a0a1_2),
vget_high_s16(a0a1_2));
const int16x4_t tmp2 = vsub_s16(vget_low_s16(a0a1_2),
vget_high_s16(a0a1_2));
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vshrn_n_s32(vaddq_s32(a2_p_a3, kCst1812), 9);
const int16x4_t tmp3 = vshrn_n_s32(vaddq_s32(a3_m_a2, kCst937), 9);
Transpose4x4_S16_NEON(tmp0, tmp1, tmp2, tmp3, &d0d1, &d3d2);
}
{ // 2nd pass
// the (1<<16) addition is for the replacement: a3!=0 <-> 1-(a3==0)
const int32x4_t kCst12000 = vdupq_n_s32(12000 + (1 << 16));
const int32x4_t kCst51000 = vdupq_n_s32(51000);
const int16x8_t a0a1 = vaddq_s16(d0d1, d3d2); // d0+d3 | d1+d2 (=a0|a1)
const int16x8_t a3a2 = vsubq_s16(d0d1, d3d2); // d0-d3 | d1-d2 (=a3|a2)
const int16x4_t a0_k7 = vadd_s16(vget_low_s16(a0a1), vdup_n_s16(7));
const int16x4_t out0 = vshr_n_s16(vadd_s16(a0_k7, vget_high_s16(a0a1)), 4);
const int16x4_t out2 = vshr_n_s16(vsub_s16(a0_k7, vget_high_s16(a0a1)), 4);
const int32x4_t a3_2217 = vmull_n_s16(vget_low_s16(a3a2), 2217);
const int32x4_t a2_2217 = vmull_n_s16(vget_high_s16(a3a2), 2217);
const int32x4_t a2_p_a3 = vmlal_n_s16(a2_2217, vget_low_s16(a3a2), 5352);
const int32x4_t a3_m_a2 = vmlsl_n_s16(a3_2217, vget_high_s16(a3a2), 5352);
const int16x4_t tmp1 = vaddhn_s32(a2_p_a3, kCst12000);
const int16x4_t out3 = vaddhn_s32(a3_m_a2, kCst51000);
const int16x4_t a3_eq_0 =
vreinterpret_s16_u16(vceq_s16(vget_low_s16(a3a2), vdup_n_s16(0)));
const int16x4_t out1 = vadd_s16(tmp1, a3_eq_0);
vst1_s16(out + 0, out0);
vst1_s16(out + 4, out1);
vst1_s16(out + 8, out2);
vst1_s16(out + 12, out3);
}
}
#else
// adapted from vp8/encoder/arm/neon/shortfdct_neon.asm
static const int16_t kCoeff16[] = {
5352, 5352, 5352, 5352, 2217, 2217, 2217, 2217
};
static const int32_t kCoeff32[] = {
1812, 1812, 1812, 1812,
937, 937, 937, 937,
12000, 12000, 12000, 12000,
51000, 51000, 51000, 51000
};
static void FTransform_NEON(const uint8_t* src, const uint8_t* ref,
int16_t* out) {
const int kBPS = BPS;
const uint8_t* src_ptr = src;
const uint8_t* ref_ptr = ref;
const int16_t* coeff16 = kCoeff16;
const int32_t* coeff32 = kCoeff32;
__asm__ volatile (
// load src into q4, q5 in high half
"vld1.8 {d8}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d10}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d9}, [%[src_ptr]], %[kBPS] \n"
"vld1.8 {d11}, [%[src_ptr]] \n"
// load ref into q6, q7 in high half
"vld1.8 {d12}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d14}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d13}, [%[ref_ptr]], %[kBPS] \n"
"vld1.8 {d15}, [%[ref_ptr]] \n"
// Pack the high values in to q4 and q6
"vtrn.32 q4, q5 \n"
"vtrn.32 q6, q7 \n"
// d[0-3] = src - ref
"vsubl.u8 q0, d8, d12 \n"
"vsubl.u8 q1, d9, d13 \n"
// load coeff16 into q8(d16=5352, d17=2217)
"vld1.16 {q8}, [%[coeff16]] \n"
// load coeff32 high half into q9 = 1812, q10 = 937
"vld1.32 {q9, q10}, [%[coeff32]]! \n"
// load coeff32 low half into q11=12000, q12=51000
"vld1.32 {q11,q12}, [%[coeff32]] \n"
// part 1
// Transpose. Register dN is the same as dN in C
"vtrn.32 d0, d2 \n"
"vtrn.32 d1, d3 \n"
"vtrn.16 d0, d1 \n"
"vtrn.16 d2, d3 \n"
"vadd.s16 d4, d0, d3 \n" // a0 = d0 + d3
"vadd.s16 d5, d1, d2 \n" // a1 = d1 + d2
"vsub.s16 d6, d1, d2 \n" // a2 = d1 - d2
"vsub.s16 d7, d0, d3 \n" // a3 = d0 - d3
"vadd.s16 d0, d4, d5 \n" // a0 + a1
"vshl.s16 d0, d0, #3 \n" // temp[0+i*4] = (a0+a1) << 3
"vsub.s16 d2, d4, d5 \n" // a0 - a1
"vshl.s16 d2, d2, #3 \n" // (temp[2+i*4] = (a0-a1) << 3
"vmlal.s16 q9, d7, d16 \n" // a3*5352 + 1812
"vmlal.s16 q10, d7, d17 \n" // a3*2217 + 937
"vmlal.s16 q9, d6, d17 \n" // a2*2217 + a3*5352 + 1812
"vmlsl.s16 q10, d6, d16 \n" // a3*2217 + 937 - a2*5352
// temp[1+i*4] = (d2*2217 + d3*5352 + 1812) >> 9
// temp[3+i*4] = (d3*2217 + 937 - d2*5352) >> 9
"vshrn.s32 d1, q9, #9 \n"
"vshrn.s32 d3, q10, #9 \n"
// part 2
// transpose d0=ip[0], d1=ip[4], d2=ip[8], d3=ip[12]
"vtrn.32 d0, d2 \n"
"vtrn.32 d1, d3 \n"
"vtrn.16 d0, d1 \n"
"vtrn.16 d2, d3 \n"
"vmov.s16 d26, #7 \n"
"vadd.s16 d4, d0, d3 \n" // a1 = ip[0] + ip[12]
"vadd.s16 d5, d1, d2 \n" // b1 = ip[4] + ip[8]
"vsub.s16 d6, d1, d2 \n" // c1 = ip[4] - ip[8]
"vadd.s16 d4, d4, d26 \n" // a1 + 7
"vsub.s16 d7, d0, d3 \n" // d1 = ip[0] - ip[12]
"vadd.s16 d0, d4, d5 \n" // op[0] = a1 + b1 + 7
"vsub.s16 d2, d4, d5 \n" // op[8] = a1 - b1 + 7
"vmlal.s16 q11, d7, d16 \n" // d1*5352 + 12000
"vmlal.s16 q12, d7, d17 \n" // d1*2217 + 51000
"vceq.s16 d4, d7, #0 \n"
"vshr.s16 d0, d0, #4 \n"
"vshr.s16 d2, d2, #4 \n"
"vmlal.s16 q11, d6, d17 \n" // c1*2217 + d1*5352 + 12000
"vmlsl.s16 q12, d6, d16 \n" // d1*2217 - c1*5352 + 51000
"vmvn d4, d4 \n" // !(d1 == 0)
// op[4] = (c1*2217 + d1*5352 + 12000)>>16
"vshrn.s32 d1, q11, #16 \n"
// op[4] += (d1!=0)
"vsub.s16 d1, d1, d4 \n"
// op[12]= (d1*2217 - c1*5352 + 51000)>>16
"vshrn.s32 d3, q12, #16 \n"
// set result to out array
"vst1.16 {q0, q1}, [%[out]] \n"
: [src_ptr] "+r"(src_ptr), [ref_ptr] "+r"(ref_ptr),
[coeff32] "+r"(coeff32) // modified registers
: [kBPS] "r"(kBPS), [coeff16] "r"(coeff16),
[out] "r"(out) // constants
: "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
"q10", "q11", "q12", "q13" // clobbered
);
}
#endif
#define LOAD_LANE_16b(VALUE, LANE) do { \
(VALUE) = vld1_lane_s16(src, (VALUE), (LANE)); \
src += stride; \
} while (0)
static void FTransformWHT_NEON(const int16_t* src, int16_t* out) {
const int stride = 16;
const int16x4_t zero = vdup_n_s16(0);
int32x4x4_t tmp0;
int16x4x4_t in;
INIT_VECTOR4(in, zero, zero, zero, zero);
LOAD_LANE_16b(in.val[0], 0);
LOAD_LANE_16b(in.val[1], 0);
LOAD_LANE_16b(in.val[2], 0);
LOAD_LANE_16b(in.val[3], 0);
LOAD_LANE_16b(in.val[0], 1);
LOAD_LANE_16b(in.val[1], 1);
LOAD_LANE_16b(in.val[2], 1);
LOAD_LANE_16b(in.val[3], 1);
LOAD_LANE_16b(in.val[0], 2);
LOAD_LANE_16b(in.val[1], 2);
LOAD_LANE_16b(in.val[2], 2);
LOAD_LANE_16b(in.val[3], 2);
LOAD_LANE_16b(in.val[0], 3);
LOAD_LANE_16b(in.val[1], 3);
LOAD_LANE_16b(in.val[2], 3);
LOAD_LANE_16b(in.val[3], 3);
{
// a0 = in[0 * 16] + in[2 * 16]
// a1 = in[1 * 16] + in[3 * 16]
// a2 = in[1 * 16] - in[3 * 16]
// a3 = in[0 * 16] - in[2 * 16]
const int32x4_t a0 = vaddl_s16(in.val[0], in.val[2]);
const int32x4_t a1 = vaddl_s16(in.val[1], in.val[3]);
const int32x4_t a2 = vsubl_s16(in.val[1], in.val[3]);
const int32x4_t a3 = vsubl_s16(in.val[0], in.val[2]);
tmp0.val[0] = vaddq_s32(a0, a1);
tmp0.val[1] = vaddq_s32(a3, a2);
tmp0.val[2] = vsubq_s32(a3, a2);
tmp0.val[3] = vsubq_s32(a0, a1);
}
{
const int32x4x4_t tmp1 = Transpose4x4_NEON(tmp0);
// a0 = tmp[0 + i] + tmp[ 8 + i]
// a1 = tmp[4 + i] + tmp[12 + i]
// a2 = tmp[4 + i] - tmp[12 + i]
// a3 = tmp[0 + i] - tmp[ 8 + i]
const int32x4_t a0 = vaddq_s32(tmp1.val[0], tmp1.val[2]);
const int32x4_t a1 = vaddq_s32(tmp1.val[1], tmp1.val[3]);
const int32x4_t a2 = vsubq_s32(tmp1.val[1], tmp1.val[3]);
const int32x4_t a3 = vsubq_s32(tmp1.val[0], tmp1.val[2]);
const int32x4_t b0 = vhaddq_s32(a0, a1); // (a0 + a1) >> 1
const int32x4_t b1 = vhaddq_s32(a3, a2); // (a3 + a2) >> 1
const int32x4_t b2 = vhsubq_s32(a3, a2); // (a3 - a2) >> 1
const int32x4_t b3 = vhsubq_s32(a0, a1); // (a0 - a1) >> 1
const int16x4_t out0 = vmovn_s32(b0);
const int16x4_t out1 = vmovn_s32(b1);
const int16x4_t out2 = vmovn_s32(b2);
const int16x4_t out3 = vmovn_s32(b3);
vst1_s16(out + 0, out0);
vst1_s16(out + 4, out1);
vst1_s16(out + 8, out2);
vst1_s16(out + 12, out3);
}
}
#undef LOAD_LANE_16b
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// a 0123, b 0123
// a 4567, b 4567
// a 89ab, b 89ab
// a cdef, b cdef
//
// transpose
//
// a 048c, b 048c
// a 159d, b 159d
// a 26ae, b 26ae
// a 37bf, b 37bf
//
static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16_NEON(int16x8x4_t q4_in) {
const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
vreinterpretq_s32_s16(q2_tmp1.val[0]));
const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
vreinterpretq_s32_s16(q2_tmp1.val[1]));
q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
return q4_in;
}
static WEBP_INLINE int16x8x4_t DistoHorizontalPass_NEON(
const int16x8x4_t q4_in) {
// {a0, a1} = {in[0] + in[2], in[1] + in[3]}
// {a3, a2} = {in[0] - in[2], in[1] - in[3]}
const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
int16x8x4_t q4_out;
// tmp[0] = a0 + a1
// tmp[1] = a3 + a2
// tmp[2] = a3 - a2
// tmp[3] = a0 - a1
INIT_VECTOR4(q4_out,
vabsq_s16(vaddq_s16(q_a0, q_a1)),
vabsq_s16(vaddq_s16(q_a3, q_a2)),
vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1));
return q4_out;
}
static WEBP_INLINE int16x8x4_t DistoVerticalPass_NEON(const uint8x8x4_t q4_in) {
const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0],
q4_in.val[2]));
const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1],
q4_in.val[3]));
const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1],
q4_in.val[3]));
const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0],
q4_in.val[2]));
int16x8x4_t q4_out;
INIT_VECTOR4(q4_out,
vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
return q4_out;
}
static WEBP_INLINE int16x4x4_t DistoLoadW_NEON(const uint16_t* w) {
const uint16x8_t q_w07 = vld1q_u16(&w[0]);
const uint16x8_t q_w8f = vld1q_u16(&w[8]);
int16x4x4_t d4_w;
INIT_VECTOR4(d4_w,
vget_low_s16(vreinterpretq_s16_u16(q_w07)),
vget_high_s16(vreinterpretq_s16_u16(q_w07)),
vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
return d4_w;
}
static WEBP_INLINE int32x2_t DistoSum_NEON(const int16x8x4_t q4_in,
const int16x4x4_t d4_w) {
int32x2_t d_sum;
// sum += w[ 0] * abs(b0);
// sum += w[ 4] * abs(b1);
// sum += w[ 8] * abs(b2);
// sum += w[12] * abs(b3);
int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
q_sum0 = vaddq_s32(q_sum0, q_sum1);
q_sum2 = vaddq_s32(q_sum2, q_sum3);
q_sum2 = vaddq_s32(q_sum0, q_sum2);
d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
d_sum = vpadd_s32(d_sum, d_sum);
return d_sum;
}
#define LOAD_LANE_32b(src, VALUE, LANE) \
(VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int Disto4x4_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
uint8x8x4_t d4_in;
// load data a, b
LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
INIT_VECTOR4(d4_in,
vreinterpret_u8_u32(d_in_ab_0123),
vreinterpret_u8_u32(d_in_ab_4567),
vreinterpret_u8_u32(d_in_ab_89ab),
vreinterpret_u8_u32(d_in_ab_cdef));
{
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent
// transpose.
const int16x8x4_t q4_v = DistoVerticalPass_NEON(d4_in);
const int16x4x4_t d4_w = DistoLoadW_NEON(w);
// horizontal pass
const int16x8x4_t q4_t = DistoTranspose4x4S16_NEON(q4_v);
const int16x8x4_t q4_h = DistoHorizontalPass_NEON(q4_t);
int32x2_t d_sum = DistoSum_NEON(q4_h, d4_w);
// abs(sum2 - sum1) >> 5
d_sum = vabs_s32(d_sum);
d_sum = vshr_n_s32(d_sum, 5);
return vget_lane_s32(d_sum, 0);
}
}
#undef LOAD_LANE_32b
static int Disto16x16_NEON(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_NEON(a + x + y, b + x + y, w);
}
}
return D;
}
//------------------------------------------------------------------------------
static void CollectHistogram_NEON(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
FTransform_NEON(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
{
int k;
const int16x8_t a0 = vld1q_s16(out + 0);
const int16x8_t b0 = vld1q_s16(out + 8);
const uint16x8_t a1 = vreinterpretq_u16_s16(vabsq_s16(a0));
const uint16x8_t b1 = vreinterpretq_u16_s16(vabsq_s16(b0));
const uint16x8_t a2 = vshrq_n_u16(a1, 3);
const uint16x8_t b2 = vshrq_n_u16(b1, 3);
const uint16x8_t a3 = vminq_u16(a2, max_coeff_thresh);
const uint16x8_t b3 = vminq_u16(b2, max_coeff_thresh);
vst1q_s16(out + 0, vreinterpretq_s16_u16(a3));
vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
++distribution[out[k]];
}
}
}
VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
static WEBP_INLINE void AccumulateSSE16_NEON(const uint8_t* const a,
const uint8_t* const b,
uint32x4_t* const sum) {
const uint8x16_t a0 = vld1q_u8(a);
const uint8x16_t b0 = vld1q_u8(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
vget_low_u8(abs_diff));
const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
vget_high_u8(abs_diff));
/* pair-wise adds and widen */
const uint32x4_t sum1 = vpaddlq_u16(prod1);
const uint32x4_t sum2 = vpaddlq_u16(prod2);
*sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}
// Horizontal sum of all four uint32_t values in 'sum'.
static int SumToInt_NEON(uint32x4_t sum) {
const uint64x2_t sum2 = vpaddlq_u32(sum);
const uint64_t sum3 = vgetq_lane_u64(sum2, 0) + vgetq_lane_u64(sum2, 1);
return (int)sum3;
}
static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt_NEON(sum);
}
static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
AccumulateSSE16_NEON(a + y * BPS, b + y * BPS, &sum);
}
return SumToInt_NEON(sum);
}
static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
const uint8x8_t a0 = vld1_u8(a + y * BPS);
const uint8x8_t b0 = vld1_u8(b + y * BPS);
const uint8x8_t abs_diff = vabd_u8(a0, b0);
const uint16x8_t prod = vmull_u8(abs_diff, abs_diff);
sum = vpadalq_u16(sum, prod);
}
return SumToInt_NEON(sum);
}
static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
const uint8x16_t a0 = Load4x4_NEON(a);
const uint8x16_t b0 = Load4x4_NEON(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
vget_low_u8(abs_diff));
const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
vget_high_u8(abs_diff));
/* pair-wise adds and widen */
const uint32x4_t sum1 = vpaddlq_u16(prod1);
const uint32x4_t sum2 = vpaddlq_u16(prod2);
return SumToInt_NEON(vaddq_u32(sum1, sum2));
}
//------------------------------------------------------------------------------
// Compilation with gcc-4.6.x is problematic for now.
#if !defined(WORK_AROUND_GCC)
static int16x8_t Quantize_NEON(int16_t* const in,
const VP8Matrix* const mtx, int offset) {
const uint16x8_t sharp = vld1q_u16(&mtx->sharpen_[offset]);
const uint16x8_t q = vld1q_u16(&mtx->q_[offset]);
const uint16x8_t iq = vld1q_u16(&mtx->iq_[offset]);
const uint32x4_t bias0 = vld1q_u32(&mtx->bias_[offset + 0]);
const uint32x4_t bias1 = vld1q_u32(&mtx->bias_[offset + 4]);
const int16x8_t a = vld1q_s16(in + offset); // in
const uint16x8_t b = vreinterpretq_u16_s16(vabsq_s16(a)); // coeff = abs(in)
const int16x8_t sign = vshrq_n_s16(a, 15); // sign
const uint16x8_t c = vaddq_u16(b, sharp); // + sharpen
const uint32x4_t m0 = vmull_u16(vget_low_u16(c), vget_low_u16(iq));
const uint32x4_t m1 = vmull_u16(vget_high_u16(c), vget_high_u16(iq));
const uint32x4_t m2 = vhaddq_u32(m0, bias0);
const uint32x4_t m3 = vhaddq_u32(m1, bias1); // (coeff * iQ + bias) >> 1
const uint16x8_t c0 = vcombine_u16(vshrn_n_u32(m2, 16),
vshrn_n_u32(m3, 16)); // QFIX=17 = 16+1
const uint16x8_t c1 = vminq_u16(c0, vdupq_n_u16(MAX_LEVEL));
const int16x8_t c2 = veorq_s16(vreinterpretq_s16_u16(c1), sign);
const int16x8_t c3 = vsubq_s16(c2, sign); // restore sign
const int16x8_t c4 = vmulq_s16(c3, vreinterpretq_s16_u16(q));
vst1q_s16(in + offset, c4);
assert(QFIX == 17); // this function can't work as is if QFIX != 16+1
return c3;
}
static const uint8_t kShuffles[4][8] = {
{ 0, 1, 2, 3, 8, 9, 16, 17 },
{ 10, 11, 4, 5, 6, 7, 12, 13 },
{ 18, 19, 24, 25, 26, 27, 20, 21 },
{ 14, 15, 22, 23, 28, 29, 30, 31 }
};
static int QuantizeBlock_NEON(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
const int16x8_t out0 = Quantize_NEON(in, mtx, 0);
const int16x8_t out1 = Quantize_NEON(in, mtx, 8);
uint8x8x4_t shuffles;
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
defined(__apple_build_version__) && (__apple_build_version__< 6020037)
uint8x16x2_t all_out;
INIT_VECTOR2(all_out, vreinterpretq_u8_s16(out0), vreinterpretq_u8_s16(out1));
INIT_VECTOR4(shuffles,
vtbl2q_u8(all_out, vld1_u8(kShuffles[0])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[1])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[2])),
vtbl2q_u8(all_out, vld1_u8(kShuffles[3])));
#else
uint8x8x4_t all_out;
INIT_VECTOR4(all_out,
vreinterpret_u8_s16(vget_low_s16(out0)),
vreinterpret_u8_s16(vget_high_s16(out0)),
vreinterpret_u8_s16(vget_low_s16(out1)),
vreinterpret_u8_s16(vget_high_s16(out1)));
INIT_VECTOR4(shuffles,
vtbl4_u8(all_out, vld1_u8(kShuffles[0])),
vtbl4_u8(all_out, vld1_u8(kShuffles[1])),
vtbl4_u8(all_out, vld1_u8(kShuffles[2])),
vtbl4_u8(all_out, vld1_u8(kShuffles[3])));
#endif
// Zigzag reordering
vst1_u8((uint8_t*)(out + 0), shuffles.val[0]);
vst1_u8((uint8_t*)(out + 4), shuffles.val[1]);
vst1_u8((uint8_t*)(out + 8), shuffles.val[2]);
vst1_u8((uint8_t*)(out + 12), shuffles.val[3]);
// test zeros
if (*(uint64_t*)(out + 0) != 0) return 1;
if (*(uint64_t*)(out + 4) != 0) return 1;
if (*(uint64_t*)(out + 8) != 0) return 1;
if (*(uint64_t*)(out + 12) != 0) return 1;
return 0;
}
static int Quantize2Blocks_NEON(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
nz = QuantizeBlock_NEON(in + 0 * 16, out + 0 * 16, mtx) << 0;
nz |= QuantizeBlock_NEON(in + 1 * 16, out + 1 * 16, mtx) << 1;
return nz;
}
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8ITransform = ITransform_NEON;
VP8FTransform = FTransform_NEON;
VP8FTransformWHT = FTransformWHT_NEON;
VP8TDisto4x4 = Disto4x4_NEON;
VP8TDisto16x16 = Disto16x16_NEON;
VP8CollectHistogram = CollectHistogram_NEON;
VP8SSE16x16 = SSE16x16_NEON;
VP8SSE16x8 = SSE16x8_NEON;
VP8SSE8x8 = SSE8x8_NEON;
VP8SSE4x4 = SSE4x4_NEON;
#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock_NEON;
VP8EncQuantize2Blocks = Quantize2Blocks_NEON;
#endif
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
#endif // WEBP_USE_NEON

1381
vendor/github.com/sizeofint/webpanimation/dsp_enc_sse2.c generated vendored Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,339 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4 version of some encoding functions.
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
#include <stdlib.h> // for abs()
#include "dsp_common_sse2.h"
#include "enc_vp8i_enc.h"
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
static void CollectHistogram_SSE41(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
int k;
VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
// Convert coefficients to bin (within out[]).
{
// Load.
const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
// v = abs(out) >> 3
const __m128i abs0 = _mm_abs_epi16(out0);
const __m128i abs1 = _mm_abs_epi16(out1);
const __m128i v0 = _mm_srai_epi16(abs0, 3);
const __m128i v1 = _mm_srai_epi16(abs1, 3);
// bin = min(v, MAX_COEFF_THRESH)
const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
// Store.
_mm_storeu_si128((__m128i*)&out[0], bin0);
_mm_storeu_si128((__m128i*)&out[8], bin1);
}
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
++distribution[out[k]];
}
}
VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
// Load and combine inputs.
{
const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
// In SSE4.1, with gcc 4.8 at least (maybe other versions),
// _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump
// of inA and inB, _mm_loadl_epi64 is still used not to have an out of
// bound read.
const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
// Combine inA and inB (we'll do two transforms in parallel).
const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
tmp_0 = _mm_cvtepu8_epi16(inAB_0);
tmp_1 = _mm_cvtepu8_epi16(inAB_1);
tmp_2 = _mm_cvtepu8_epi16(inAB_2);
tmp_3 = _mm_cvtepu8_epi16(inAB_3);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
}
// Vertical pass first to avoid a transpose (vertical and horizontal passes
// are commutative because w/kWeightY is symmetric) and subsequent transpose.
{
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
// a00 a01 a02 a03 b00 b01 b02 b03
// a10 a11 a12 a13 b10 b11 b12 b13
// a20 a21 a22 a23 b20 b21 b22 b23
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
}
// Horizontal pass and difference of weighted sums.
{
// Load all inputs.
const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
const __m128i b0 = _mm_add_epi16(a0, a1);
const __m128i b1 = _mm_add_epi16(a3, a2);
const __m128i b2 = _mm_sub_epi16(a3, a2);
const __m128i b3 = _mm_sub_epi16(a0, a1);
// Separate the transforms of inA and inB.
__m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
__m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
__m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
__m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
A_b0 = _mm_abs_epi16(A_b0);
A_b2 = _mm_abs_epi16(A_b2);
B_b0 = _mm_abs_epi16(B_b0);
B_b2 = _mm_abs_epi16(B_b2);
// weighted sums
A_b0 = _mm_madd_epi16(A_b0, w_0);
A_b2 = _mm_madd_epi16(A_b2, w_8);
B_b0 = _mm_madd_epi16(B_b0, w_0);
B_b2 = _mm_madd_epi16(B_b2, w_8);
A_b0 = _mm_add_epi32(A_b0, A_b2);
B_b0 = _mm_add_epi32(B_b0, B_b2);
// difference of weighted sums
A_b2 = _mm_sub_epi32(A_b0, B_b0);
_mm_storeu_si128((__m128i*)&sum[0], A_b2);
}
return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
const int diff_sum = TTransform_SSE41(a, b, w);
return abs(diff_sum) >> 5;
}
static int Disto16x16_SSE41(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
D += Disto4x4_SSE41(a + x + y, b + x + y, w);
}
}
return D;
}
//------------------------------------------------------------------------------
// Quantization
//
// Generates a pshufb constant for shuffling 16b words.
#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
_mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const uint16_t* const sharpen,
const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i out0, out8;
__m128i packed_out;
// Load all inputs.
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
// coeff = abs(in)
__m128i coeff0 = _mm_abs_epi16(in0);
__m128i coeff8 = _mm_abs_epi16(in8);
// coeff = abs(in) + sharpen
if (sharpen != NULL) {
const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
coeff0 = _mm_add_epi16(coeff0, sharpen0);
coeff8 = _mm_add_epi16(coeff8, sharpen8);
}
// out = (coeff * iQ + B) >> QFIX
{
// doing calculations with 32b precision (QFIX=17)
// out = (coeff * iQ)
const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
__m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
// out = (coeff * iQ + B)
const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
out_00 = _mm_add_epi32(out_00, bias_00);
out_04 = _mm_add_epi32(out_04, bias_04);
out_08 = _mm_add_epi32(out_08, bias_08);
out_12 = _mm_add_epi32(out_12, bias_12);
// out = QUANTDIV(coeff, iQ, B, QFIX)
out_00 = _mm_srai_epi32(out_00, QFIX);
out_04 = _mm_srai_epi32(out_04, QFIX);
out_08 = _mm_srai_epi32(out_08, QFIX);
out_12 = _mm_srai_epi32(out_12, QFIX);
// pack result as 16b
out0 = _mm_packs_epi32(out_00, out_04);
out8 = _mm_packs_epi32(out_08, out_12);
// if (coeff > 2047) coeff = 2047
out0 = _mm_min_epi16(out0, max_coeff_2047);
out8 = _mm_min_epi16(out8, max_coeff_2047);
}
// put sign back
out0 = _mm_sign_epi16(out0, in0);
out8 = _mm_sign_epi16(out8, in8);
// in = out * Q
in0 = _mm_mullo_epi16(out0, q0);
in8 = _mm_mullo_epi16(out8, q8);
_mm_storeu_si128((__m128i*)&in[0], in0);
_mm_storeu_si128((__m128i*)&in[8], in8);
// zigzag the output before storing it. The re-ordering is:
// 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15
// -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
// There's only two misplaced entries ([8] and [7]) that are crossing the
// reg's boundaries.
// We use pshufb instead of pshuflo/pshufhi.
{
const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7
const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8
const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
_mm_storeu_si128((__m128i*)&out[0], out_z0);
_mm_storeu_si128((__m128i*)&out[8], out_z8);
packed_out = _mm_packs_epi16(out_z0, out_z8);
}
// detect if all 'out' values are zeroes or not
return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
#undef PSHUFB_CST
static int QuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE41(in, out, &mtx->sharpen_[0], mtx);
}
static int QuantizeBlockWHT_SSE41(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
return DoQuantizeBlock_SSE41(in, out, NULL, mtx);
}
static int Quantize2Blocks_SSE41(int16_t in[32], int16_t out[32],
const VP8Matrix* const mtx) {
int nz;
const uint16_t* const sharpen = &mtx->sharpen_[0];
nz = DoQuantizeBlock_SSE41(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
nz |= DoQuantizeBlock_SSE41(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
return nz;
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
VP8CollectHistogram = CollectHistogram_SSE41;
VP8EncQuantizeBlock = QuantizeBlock_SSE41;
VP8EncQuantize2Blocks = Quantize2Blocks_SSE41;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT_SSE41;
VP8TDisto4x4 = Disto4x4_SSE41;
VP8TDisto16x16 = Disto16x16_SSE41;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
#endif // WEBP_USE_SSE41

287
vendor/github.com/sizeofint/webpanimation/dsp_filters.c generated vendored Normal file
View File

@ -0,0 +1,287 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Spatial prediction using various filters
//
// Author: Urvang (urvang@google.com)
#include "dsp_dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
//------------------------------------------------------------------------------
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert((in) != NULL); \
assert((out) != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void PredictLine_C(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length, int inverse) {
int i;
if (inverse) {
for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + pred[i]);
} else {
for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - pred[i]);
}
}
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
PredictLine_C(in, preds - stride, out, 1, inverse);
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
in += stride;
out += stride;
} else {
// We are starting from in-between. Make sure 'preds' points to prev row.
preds -= stride;
}
// Filter line-by-line.
while (row < last_row) {
PredictLine_C(in, preds, out, width, inverse);
++row;
preds += stride;
in += stride;
out += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
#if !WEBP_NEON_OMIT_C_CODE
static WEBP_INLINE void DoGradientFilter_C(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
int inverse, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = inverse ? out : in;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine_C(in + 1, preds, out + 1, width - 1, inverse);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
int w;
// leftmost pixel: predict from above.
PredictLine_C(in, preds - stride, out, 1, inverse);
for (w = 1; w < width; ++w) {
const int pred = GradientPredictor_C(preds[w - 1],
preds[w - stride],
preds[w - stride - 1]);
out[w] = (uint8_t)(in[w] + (inverse ? pred : -pred));
}
++row;
preds += stride;
in += stride;
out += stride;
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef SANITY_CHECK
//------------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
static void HorizontalFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_C(data, width, height, stride, 0, height, 0,
filtered_data);
}
static void VerticalFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
}
static void GradientFilter_C(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_C(data, width, height, stride, 0, height, 0, filtered_data);
}
#endif // !WEBP_NEON_OMIT_C_CODE
//------------------------------------------------------------------------------
static void HorizontalUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
uint8_t pred = (prev == NULL) ? 0 : prev[0];
int i;
for (i = 0; i < width; ++i) {
out[i] = (uint8_t)(pred + in[i]);
pred = out[i];
}
}
#if !WEBP_NEON_OMIT_C_CODE
static void VerticalUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_C(NULL, in, out, width);
} else {
int i;
for (i = 0; i < width; ++i) out[i] = (uint8_t)(prev[i] + in[i]);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
static void GradientUnfilter_C(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_C(NULL, in, out, width);
} else {
uint8_t top = prev[0], top_left = top, left = top;
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==out
left = (uint8_t)(in[i] + GradientPredictor_C(left, top, top_left));
top_left = top;
out[i] = left;
}
}
}
//------------------------------------------------------------------------------
// Init function
WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
extern void VP8FiltersInitMIPSdspR2(void);
extern void VP8FiltersInitMSA(void);
extern void VP8FiltersInitNEON(void);
extern void VP8FiltersInitSSE2(void);
WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
WebPUnfilters[WEBP_FILTER_NONE] = NULL;
#if !WEBP_NEON_OMIT_C_CODE
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_C;
#endif
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_C;
WebPFilters[WEBP_FILTER_NONE] = NULL;
#if !WEBP_NEON_OMIT_C_CODE
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_C;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_C;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8FiltersInitSSE2();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8FiltersInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
VP8FiltersInitMSA();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8FiltersInitNEON();
}
#endif
assert(WebPUnfilters[WEBP_FILTER_HORIZONTAL] != NULL);
assert(WebPUnfilters[WEBP_FILTER_VERTICAL] != NULL);
assert(WebPUnfilters[WEBP_FILTER_GRADIENT] != NULL);
assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
}

View File

@ -0,0 +1,402 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Spatial prediction using various filters
//
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "dsp_dsp.h"
#include <assert.h>
#include <stdlib.h>
#include <string.h>
//------------------------------------------------------------------------------
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
#define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \
const uint8_t* psrc = (uint8_t*)(SRC); \
uint8_t* pdst = (uint8_t*)(DST); \
const int ilength = (int)(LENGTH); \
int temp0, temp1, temp2, temp3, temp4, temp5, temp6; \
__asm__ volatile ( \
".set push \n\t" \
".set noreorder \n\t" \
"srl %[temp0], %[length], 2 \n\t" \
"beqz %[temp0], 4f \n\t" \
" andi %[temp6], %[length], 3 \n\t" \
".if " #INVERSE " \n\t" \
"1: \n\t" \
"lbu %[temp1], -1(%[dst]) \n\t" \
"lbu %[temp2], 0(%[src]) \n\t" \
"lbu %[temp3], 1(%[src]) \n\t" \
"lbu %[temp4], 2(%[src]) \n\t" \
"lbu %[temp5], 3(%[src]) \n\t" \
"addu %[temp1], %[temp1], %[temp2] \n\t" \
"addu %[temp2], %[temp1], %[temp3] \n\t" \
"addu %[temp3], %[temp2], %[temp4] \n\t" \
"addu %[temp4], %[temp3], %[temp5] \n\t" \
"sb %[temp1], 0(%[dst]) \n\t" \
"sb %[temp2], 1(%[dst]) \n\t" \
"sb %[temp3], 2(%[dst]) \n\t" \
"sb %[temp4], 3(%[dst]) \n\t" \
"addiu %[src], %[src], 4 \n\t" \
"addiu %[temp0], %[temp0], -1 \n\t" \
"bnez %[temp0], 1b \n\t" \
" addiu %[dst], %[dst], 4 \n\t" \
".else \n\t" \
"1: \n\t" \
"ulw %[temp1], -1(%[src]) \n\t" \
"ulw %[temp2], 0(%[src]) \n\t" \
"addiu %[src], %[src], 4 \n\t" \
"addiu %[temp0], %[temp0], -1 \n\t" \
"subu.qb %[temp3], %[temp2], %[temp1] \n\t" \
"usw %[temp3], 0(%[dst]) \n\t" \
"bnez %[temp0], 1b \n\t" \
" addiu %[dst], %[dst], 4 \n\t" \
".endif \n\t" \
"4: \n\t" \
"beqz %[temp6], 3f \n\t" \
" nop \n\t" \
"2: \n\t" \
"lbu %[temp2], 0(%[src]) \n\t" \
".if " #INVERSE " \n\t" \
"lbu %[temp1], -1(%[dst]) \n\t" \
"addu %[temp3], %[temp1], %[temp2] \n\t" \
".else \n\t" \
"lbu %[temp1], -1(%[src]) \n\t" \
"subu %[temp3], %[temp1], %[temp2] \n\t" \
".endif \n\t" \
"addiu %[src], %[src], 1 \n\t" \
"sb %[temp3], 0(%[dst]) \n\t" \
"addiu %[temp6], %[temp6], -1 \n\t" \
"bnez %[temp6], 2b \n\t" \
" addiu %[dst], %[dst], 1 \n\t" \
"3: \n\t" \
".set pop \n\t" \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [dst]"+&r"(pdst), [src]"+&r"(psrc) \
: [length]"r"(ilength) \
: "memory" \
); \
} while (0)
static WEBP_INLINE void PredictLine_MIPSdspR2(const uint8_t* src, uint8_t* dst,
int length) {
DO_PREDICT_LINE(src, dst, length, 0);
}
#define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do { \
const uint8_t* psrc = (uint8_t*)(SRC); \
const uint8_t* ppred = (uint8_t*)(PRED); \
uint8_t* pdst = (uint8_t*)(DST); \
const int ilength = (int)(LENGTH); \
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
__asm__ volatile ( \
".set push \n\t" \
".set noreorder \n\t" \
"srl %[temp0], %[length], 0x3 \n\t" \
"beqz %[temp0], 4f \n\t" \
" andi %[temp7], %[length], 0x7 \n\t" \
"1: \n\t" \
"ulw %[temp1], 0(%[src]) \n\t" \
"ulw %[temp2], 0(%[pred]) \n\t" \
"ulw %[temp3], 4(%[src]) \n\t" \
"ulw %[temp4], 4(%[pred]) \n\t" \
"addiu %[src], %[src], 8 \n\t" \
".if " #INVERSE " \n\t" \
"addu.qb %[temp5], %[temp1], %[temp2] \n\t" \
"addu.qb %[temp6], %[temp3], %[temp4] \n\t" \
".else \n\t" \
"subu.qb %[temp5], %[temp1], %[temp2] \n\t" \
"subu.qb %[temp6], %[temp3], %[temp4] \n\t" \
".endif \n\t" \
"addiu %[pred], %[pred], 8 \n\t" \
"usw %[temp5], 0(%[dst]) \n\t" \
"usw %[temp6], 4(%[dst]) \n\t" \
"addiu %[temp0], %[temp0], -1 \n\t" \
"bnez %[temp0], 1b \n\t" \
" addiu %[dst], %[dst], 8 \n\t" \
"4: \n\t" \
"beqz %[temp7], 3f \n\t" \
" nop \n\t" \
"2: \n\t" \
"lbu %[temp1], 0(%[src]) \n\t" \
"lbu %[temp2], 0(%[pred]) \n\t" \
"addiu %[src], %[src], 1 \n\t" \
"addiu %[pred], %[pred], 1 \n\t" \
".if " #INVERSE " \n\t" \
"addu %[temp3], %[temp1], %[temp2] \n\t" \
".else \n\t" \
"subu %[temp3], %[temp1], %[temp2] \n\t" \
".endif \n\t" \
"sb %[temp3], 0(%[dst]) \n\t" \
"addiu %[temp7], %[temp7], -1 \n\t" \
"bnez %[temp7], 2b \n\t" \
" addiu %[dst], %[dst], 1 \n\t" \
"3: \n\t" \
".set pop \n\t" \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [pred]"+&r"(ppred), \
[dst]"+&r"(pdst), [src]"+&r"(psrc) \
: [length]"r"(ilength) \
: "memory" \
); \
} while (0)
#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST) do { \
int temp1, temp2, temp3; \
__asm__ volatile ( \
"lbu %[temp1], 0(%[src]) \n\t" \
"lbu %[temp2], 0(%[pred]) \n\t" \
"subu %[temp3], %[temp1], %[temp2] \n\t" \
"sb %[temp3], 0(%[dst]) \n\t" \
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
: [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC)) \
: "memory" \
); \
} while (0)
//------------------------------------------------------------------------------
// Horizontal filter.
#define FILTER_LINE_BY_LINE do { \
while (row < last_row) { \
PREDICT_LINE_ONE_PASS(in, preds - stride, out); \
DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0); \
++row; \
preds += stride; \
in += stride; \
out += stride; \
} \
} while (0)
static WEBP_INLINE void DoHorizontalFilter_MIPSdspR2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = in;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
FILTER_LINE_BY_LINE;
}
#undef FILTER_LINE_BY_LINE
static void HorizontalFilter_MIPSdspR2(const uint8_t* data,
int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
// Vertical filter.
#define FILTER_LINE_BY_LINE do { \
while (row < last_row) { \
DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0); \
++row; \
preds += stride; \
in += stride; \
out += stride; \
} \
} while (0)
static WEBP_INLINE void DoVerticalFilter_MIPSdspR2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = in;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
} else {
// We are starting from in-between. Make sure 'preds' points to prev row.
preds -= stride;
}
// Filter line-by-line.
FILTER_LINE_BY_LINE;
}
#undef FILTER_LINE_BY_LINE
static void VerticalFilter_MIPSdspR2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
// Gradient filter.
static int GradientPredictor_MIPSdspR2(uint8_t a, uint8_t b, uint8_t c) {
int temp0;
__asm__ volatile (
"addu %[temp0], %[a], %[b] \n\t"
"subu %[temp0], %[temp0], %[c] \n\t"
"shll_s.w %[temp0], %[temp0], 23 \n\t"
"precrqu_s.qb.ph %[temp0], %[temp0], $zero \n\t"
"srl %[temp0], %[temp0], 24 \n\t"
: [temp0]"=&r"(temp0)
: [a]"r"(a),[b]"r"(b),[c]"r"(c)
);
return temp0;
}
#define FILTER_LINE_BY_LINE(PREDS, OPERATION) do { \
while (row < last_row) { \
int w; \
PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \
for (w = 1; w < width; ++w) { \
const int pred = GradientPredictor_MIPSdspR2(PREDS[w - 1], \
PREDS[w - stride], \
PREDS[w - stride - 1]); \
out[w] = in[w] OPERATION pred; \
} \
++row; \
in += stride; \
out += stride; \
} \
} while (0)
static void DoGradientFilter_MIPSdspR2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows, uint8_t* out) {
const uint8_t* preds;
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
preds = in;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLine_MIPSdspR2(in + 1, out + 1, width - 1);
row = 1;
preds += stride;
in += stride;
out += stride;
}
// Filter line-by-line.
FILTER_LINE_BY_LINE(in, -);
}
#undef FILTER_LINE_BY_LINE
static void GradientFilter_MIPSdspR2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_MIPSdspR2(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
static void HorizontalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1);
}
static void VerticalUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
} else {
DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1);
}
}
static void GradientUnfilter_MIPSdspR2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_MIPSdspR2(NULL, in, out, width);
} else {
uint8_t top = prev[0], top_left = top, left = top;
int i;
for (i = 0; i < width; ++i) {
top = prev[i]; // need to read this first, in case prev==dst
left = in[i] + GradientPredictor_MIPSdspR2(left, top, top_left);
top_left = top;
out[i] = left;
}
}
}
#undef DO_PREDICT_LINE_VERTICAL
#undef PREDICT_LINE_ONE_PASS
#undef DO_PREDICT_LINE
#undef SANITY_CHECK
//------------------------------------------------------------------------------
// Entry point
extern void VP8FiltersInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_MIPSdspR2;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_MIPSdspR2;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MIPSdspR2;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8FiltersInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View File

@ -0,0 +1,202 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA variant of alpha filters
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MSA)
#include "dsp_msa_macro.h"
#include <assert.h>
static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
const uint8_t* pred,
uint8_t* dst, int length) {
v16u8 src0, pred0, dst0;
assert(length >= 0);
while (length >= 32) {
v16u8 src1, pred1, dst1;
LD_UB2(src, 16, src0, src1);
LD_UB2(pred, 16, pred0, pred1);
SUB2(src0, pred0, src1, pred1, dst0, dst1);
ST_UB2(dst0, dst1, dst, 16);
src += 32;
pred += 32;
dst += 32;
length -= 32;
}
if (length > 0) {
int i;
if (length >= 16) {
src0 = LD_UB(src);
pred0 = LD_UB(pred);
dst0 = src0 - pred0;
ST_UB(dst0, dst);
src += 16;
pred += 16;
dst += 16;
length -= 16;
}
for (i = 0; i < length; i++) {
dst[i] = src[i] - pred[i];
}
}
}
//------------------------------------------------------------------------------
// Helpful macro.
#define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width);
//------------------------------------------------------------------------------
// Horrizontal filter
static void HorizontalFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* preds = data;
const uint8_t* in = data;
uint8_t* out = filtered_data;
int row = 1;
SANITY_CHECK(in, out);
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineInverse0(in + 1, preds, out + 1, width - 1);
preds += stride;
in += stride;
out += stride;
// Filter line-by-line.
while (row < height) {
// Leftmost pixel is predicted from above.
PredictLineInverse0(in, preds - stride, out, 1);
PredictLineInverse0(in + 1, preds, out + 1, width - 1);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Gradient filter
static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
const uint8_t* ppred,
uint8_t* poutput, int stride,
int size) {
int w;
const v16i8 zero = { 0 };
while (size >= 16) {
v16u8 pred0, dst0;
v8i16 a0, a1, b0, b1, c0, c1;
const v16u8 tmp0 = LD_UB(ppred - 1);
const v16u8 tmp1 = LD_UB(ppred - stride);
const v16u8 tmp2 = LD_UB(ppred - stride - 1);
const v16u8 src0 = LD_UB(pinput);
ILVRL_B2_SH(zero, tmp0, a0, a1);
ILVRL_B2_SH(zero, tmp1, b0, b1);
ILVRL_B2_SH(zero, tmp2, c0, c1);
ADD2(a0, b0, a1, b1, a0, a1);
SUB2(a0, c0, a1, c1, a0, a1);
CLIP_SH2_0_255(a0, a1);
pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
dst0 = src0 - pred0;
ST_UB(dst0, poutput);
ppred += 16;
pinput += 16;
poutput += 16;
size -= 16;
}
for (w = 0; w < size; ++w) {
const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
}
}
static void GradientFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
int row = 1;
SANITY_CHECK(in, out);
// left prediction for top scan-line
out[0] = in[0];
PredictLineInverse0(in + 1, preds, out + 1, width - 1);
preds += stride;
in += stride;
out += stride;
// Filter line-by-line.
while (row < height) {
out[0] = in[0] - preds[- stride];
PredictLineGradient(preds + 1, in + 1, out + 1, stride, width - 1);
++row;
preds += stride;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Vertical filter
static void VerticalFilter_MSA(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
const uint8_t* in = data;
const uint8_t* preds = data;
uint8_t* out = filtered_data;
int row = 1;
SANITY_CHECK(in, out);
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineInverse0(in + 1, preds, out + 1, width - 1);
in += stride;
out += stride;
// Filter line-by-line.
while (row < height) {
PredictLineInverse0(in, preds, out, width);
++row;
preds += stride;
in += stride;
out += stride;
}
}
#undef SANITY_CHECK
//------------------------------------------------------------------------------
// Entry point
extern void VP8FiltersInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_MSA;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_MSA;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_MSA;
}
#else // !WEBP_USE_MSA
WEBP_DSP_INIT_STUB(VP8FiltersInitMSA)
#endif // WEBP_USE_MSA

View File

@ -0,0 +1,329 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of alpha filters
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include "dsp_neon.h"
//------------------------------------------------------------------------------
// Helpful macros.
# define SANITY_CHECK(in, out) \
assert(in != NULL); \
assert(out != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
// load eight u8 and widen to s16
#define U8_TO_S16(A) vreinterpretq_s16_u16(vmovl_u8(A))
#define LOAD_U8_TO_S16(A) U8_TO_S16(vld1_u8(A))
// shift left or right by N byte, inserting zeros
#define SHIFT_RIGHT_N_Q(A, N) vextq_u8((A), zero, (N))
#define SHIFT_LEFT_N_Q(A, N) vextq_u8(zero, (A), (16 - (N)) % 16)
// rotate left by N bytes
#define ROTATE_LEFT_N(A, N) vext_u8((A), (A), (N))
// rotate right by N bytes
#define ROTATE_RIGHT_N(A, N) vext_u8((A), (A), (8 - (N)) % 8)
static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length) {
int i;
assert(length >= 0);
for (i = 0; i + 16 <= length; i += 16) {
const uint8x16_t A = vld1q_u8(&src[i]);
const uint8x16_t B = vld1q_u8(&pred[i]);
const uint8x16_t C = vsubq_u8(A, B);
vst1q_u8(&dst[i], C);
}
for (; i < length; ++i) dst[i] = src[i] - pred[i];
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
PredictLine_NEON(src, src - 1, dst, length);
}
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
out[0] = in[0] - in[-stride];
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
++row;
in += stride;
out += stride;
}
}
static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
PredictLine_NEON(in, in - stride, out, width);
++row;
in += stride;
out += stride;
}
}
static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
static void GradientPredictDirect_NEON(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
int i;
for (i = 0; i + 8 <= length; i += 8) {
const uint8x8_t A = vld1_u8(&row[i - 1]);
const uint8x8_t B = vld1_u8(&top[i + 0]);
const int16x8_t C = vreinterpretq_s16_u16(vaddl_u8(A, B));
const int16x8_t D = LOAD_U8_TO_S16(&top[i - 1]);
const uint8x8_t E = vqmovun_s16(vsubq_s16(C, D));
const uint8x8_t F = vld1_u8(&row[i + 0]);
vst1_u8(&out[i], vsub_u8(F, E));
}
for (; i < length; ++i) {
out[i] = row[i] - GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
}
}
static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLineLeft_NEON(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
out[0] = in[0] - in[-stride];
GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
out += stride;
}
}
static void GradientFilter_NEON(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_NEON(data, width, height, stride, 0, height,
filtered_data);
}
#undef SANITY_CHECK
//------------------------------------------------------------------------------
// Inverse transforms
static void HorizontalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
int i;
const uint8x16_t zero = vdupq_n_u8(0);
uint8x16_t last;
out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
if (width <= 1) return;
last = vsetq_lane_u8(out[0], zero, 0);
for (i = 1; i + 16 <= width; i += 16) {
const uint8x16_t A0 = vld1q_u8(&in[i]);
const uint8x16_t A1 = vaddq_u8(A0, last);
const uint8x16_t A2 = SHIFT_LEFT_N_Q(A1, 1);
const uint8x16_t A3 = vaddq_u8(A1, A2);
const uint8x16_t A4 = SHIFT_LEFT_N_Q(A3, 2);
const uint8x16_t A5 = vaddq_u8(A3, A4);
const uint8x16_t A6 = SHIFT_LEFT_N_Q(A5, 4);
const uint8x16_t A7 = vaddq_u8(A5, A6);
const uint8x16_t A8 = SHIFT_LEFT_N_Q(A7, 8);
const uint8x16_t A9 = vaddq_u8(A7, A8);
vst1q_u8(&out[i], A9);
last = SHIFT_RIGHT_N_Q(A9, 15);
}
for (; i < width; ++i) out[i] = in[i] + out[i - 1];
}
static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_NEON(NULL, in, out, width);
} else {
int i;
assert(width >= 0);
for (i = 0; i + 16 <= width; i += 16) {
const uint8x16_t A = vld1q_u8(&in[i]);
const uint8x16_t B = vld1q_u8(&prev[i]);
const uint8x16_t C = vaddq_u8(A, B);
vst1q_u8(&out[i], C);
}
for (; i < width; ++i) out[i] = in[i] + prev[i];
}
}
// GradientUnfilter_NEON is correct but slower than the C-version,
// at least on ARM64. For armv7, it's a wash.
// So best is to disable it for now, but keep the idea around...
#if !defined(USE_GRADIENT_UNFILTER)
#define USE_GRADIENT_UNFILTER 0 // ALTERNATE_CODE
#endif
#if (USE_GRADIENT_UNFILTER == 1)
#define GRAD_PROCESS_LANE(L) do { \
const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1); /* rotate predictor in */ \
const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1)); \
const uint8x8_t delta = vqmovun_s16(tmp2); \
pred = vadd_u8(D, delta); \
out = vext_u8(out, ROTATE_LEFT_N(pred, (L)), 1); \
} while (0)
static void GradientPredictInverse_NEON(const uint8_t* const in,
const uint8_t* const top,
uint8_t* const row, int length) {
if (length > 0) {
int i;
uint8x8_t pred = vdup_n_u8(row[-1]); // left sample
uint8x8_t out = vdup_n_u8(0);
for (i = 0; i + 8 <= length; i += 8) {
const int16x8_t B = LOAD_U8_TO_S16(&top[i + 0]);
const int16x8_t C = LOAD_U8_TO_S16(&top[i - 1]);
const int16x8_t BC = vsubq_s16(B, C); // unclipped gradient basis B - C
const uint8x8_t D = vld1_u8(&in[i]); // base input
GRAD_PROCESS_LANE(0);
GRAD_PROCESS_LANE(1);
GRAD_PROCESS_LANE(2);
GRAD_PROCESS_LANE(3);
GRAD_PROCESS_LANE(4);
GRAD_PROCESS_LANE(5);
GRAD_PROCESS_LANE(6);
GRAD_PROCESS_LANE(7);
vst1_u8(&row[i], out);
}
for (; i < length; ++i) {
row[i] = in[i] + GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
}
}
}
#undef GRAD_PROCESS_LANE
static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_NEON(NULL, in, out, width);
} else {
out[0] = in[0] + prev[0]; // predict from above
GradientPredictInverse_NEON(in + 1, prev + 1, out + 1, width - 1);
}
}
#endif // USE_GRADIENT_UNFILTER
//------------------------------------------------------------------------------
// Entry point
extern void VP8FiltersInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
#if (USE_GRADIENT_UNFILTER == 1)
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
#endif
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_NEON;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_NEON;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8FiltersInitNEON)
#endif // WEBP_USE_NEON

View File

@ -0,0 +1,335 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of alpha filters
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <stdlib.h>
#include <string.h>
//------------------------------------------------------------------------------
// Helpful macro.
# define SANITY_CHECK(in, out) \
assert((in) != NULL); \
assert((out) != NULL); \
assert(width > 0); \
assert(height > 0); \
assert(stride >= width); \
assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
(void)height; // Silence unused warning.
static void PredictLineTop_SSE2(const uint8_t* src, const uint8_t* pred,
uint8_t* dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
for (i = 0; i < max_pos; i += 32) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]);
const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]);
const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
const __m128i C0 = _mm_sub_epi8(A0, B0);
const __m128i C1 = _mm_sub_epi8(A1, B1);
_mm_storeu_si128((__m128i*)&dst[i + 0], C0);
_mm_storeu_si128((__m128i*)&dst[i + 16], C1);
}
for (; i < length; ++i) dst[i] = src[i] - pred[i];
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft_SSE2(const uint8_t* src, uint8_t* dst, int length) {
int i;
const int max_pos = length & ~31;
assert(length >= 0);
for (i = 0; i < max_pos; i += 32) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 ));
const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 ));
const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
const __m128i C0 = _mm_sub_epi8(A0, B0);
const __m128i C1 = _mm_sub_epi8(A1, B1);
_mm_storeu_si128((__m128i*)(dst + i + 0), C0);
_mm_storeu_si128((__m128i*)(dst + i + 16), C1);
}
for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
}
//------------------------------------------------------------------------------
// Horizontal filter.
static WEBP_INLINE void DoHorizontalFilter_SSE2(const uint8_t* in,
int width, int height,
int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
if (row == 0) {
// Leftmost pixel is the same as input for topmost scanline.
out[0] = in[0];
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
// Leftmost pixel is predicted from above.
out[0] = in[0] - in[-stride];
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
++row;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Vertical filter.
static WEBP_INLINE void DoVerticalFilter_SSE2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
if (row == 0) {
// Very first top-left pixel is copied.
out[0] = in[0];
// Rest of top scan-line is left-predicted.
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
PredictLineTop_SSE2(in, in - stride, out, width);
++row;
in += stride;
out += stride;
}
}
//------------------------------------------------------------------------------
// Gradient filter.
static WEBP_INLINE int GradientPredictor_SSE2(uint8_t a, uint8_t b, uint8_t c) {
const int g = a + b - c;
return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
}
static void GradientPredictDirect_SSE2(const uint8_t* const row,
const uint8_t* const top,
uint8_t* const out, int length) {
const int max_pos = length & ~7;
int i;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i < max_pos; i += 8) {
const __m128i A0 = _mm_loadl_epi64((const __m128i*)&row[i - 1]);
const __m128i B0 = _mm_loadl_epi64((const __m128i*)&top[i]);
const __m128i C0 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
const __m128i D = _mm_loadl_epi64((const __m128i*)&row[i]);
const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
const __m128i B1 = _mm_unpacklo_epi8(B0, zero);
const __m128i C1 = _mm_unpacklo_epi8(C0, zero);
const __m128i E = _mm_add_epi16(A1, B1);
const __m128i F = _mm_sub_epi16(E, C1);
const __m128i G = _mm_packus_epi16(F, zero);
const __m128i H = _mm_sub_epi8(D, G);
_mm_storel_epi64((__m128i*)(out + i), H);
}
for (; i < length; ++i) {
const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
out[i] = (uint8_t)(row[i] - delta);
}
}
static WEBP_INLINE void DoGradientFilter_SSE2(const uint8_t* in,
int width, int height, int stride,
int row, int num_rows,
uint8_t* out) {
const size_t start_offset = row * stride;
const int last_row = row + num_rows;
SANITY_CHECK(in, out);
in += start_offset;
out += start_offset;
// left prediction for top scan-line
if (row == 0) {
out[0] = in[0];
PredictLineLeft_SSE2(in + 1, out + 1, width - 1);
row = 1;
in += stride;
out += stride;
}
// Filter line-by-line.
while (row < last_row) {
out[0] = (uint8_t)(in[0] - in[-stride]);
GradientPredictDirect_SSE2(in + 1, in + 1 - stride, out + 1, width - 1);
++row;
in += stride;
out += stride;
}
}
#undef SANITY_CHECK
//------------------------------------------------------------------------------
static void HorizontalFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoHorizontalFilter_SSE2(data, width, height, stride, 0, height,
filtered_data);
}
static void VerticalFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoVerticalFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
}
static void GradientFilter_SSE2(const uint8_t* data, int width, int height,
int stride, uint8_t* filtered_data) {
DoGradientFilter_SSE2(data, width, height, stride, 0, height, filtered_data);
}
//------------------------------------------------------------------------------
// Inverse transforms
static void HorizontalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
int i;
__m128i last;
out[0] = (uint8_t)(in[0] + (prev == NULL ? 0 : prev[0]));
if (width <= 1) return;
last = _mm_set_epi32(0, 0, 0, out[0]);
for (i = 1; i + 8 <= width; i += 8) {
const __m128i A0 = _mm_loadl_epi64((const __m128i*)(in + i));
const __m128i A1 = _mm_add_epi8(A0, last);
const __m128i A2 = _mm_slli_si128(A1, 1);
const __m128i A3 = _mm_add_epi8(A1, A2);
const __m128i A4 = _mm_slli_si128(A3, 2);
const __m128i A5 = _mm_add_epi8(A3, A4);
const __m128i A6 = _mm_slli_si128(A5, 4);
const __m128i A7 = _mm_add_epi8(A5, A6);
_mm_storel_epi64((__m128i*)(out + i), A7);
last = _mm_srli_epi64(A7, 56);
}
for (; i < width; ++i) out[i] = (uint8_t)(in[i] + out[i - 1]);
}
static void VerticalUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
int i;
const int max_pos = width & ~31;
assert(width >= 0);
for (i = 0; i < max_pos; i += 32) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)&in[i + 0]);
const __m128i A1 = _mm_loadu_si128((const __m128i*)&in[i + 16]);
const __m128i B0 = _mm_loadu_si128((const __m128i*)&prev[i + 0]);
const __m128i B1 = _mm_loadu_si128((const __m128i*)&prev[i + 16]);
const __m128i C0 = _mm_add_epi8(A0, B0);
const __m128i C1 = _mm_add_epi8(A1, B1);
_mm_storeu_si128((__m128i*)&out[i + 0], C0);
_mm_storeu_si128((__m128i*)&out[i + 16], C1);
}
for (; i < width; ++i) out[i] = (uint8_t)(in[i] + prev[i]);
}
}
static void GradientPredictInverse_SSE2(const uint8_t* const in,
const uint8_t* const top,
uint8_t* const row, int length) {
if (length > 0) {
int i;
const int max_pos = length & ~7;
const __m128i zero = _mm_setzero_si128();
__m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample
for (i = 0; i < max_pos; i += 8) {
const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
const __m128i D = _mm_loadl_epi64((const __m128i*)&in[i]); // base input
const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C
__m128i out = zero; // accumulator for output
__m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
int k = 8;
while (1) {
const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C
const __m128i tmp4 = _mm_packus_epi16(tmp3, zero); // saturate delta
const __m128i tmp5 = _mm_add_epi8(tmp4, D); // add to in[]
A = _mm_and_si128(tmp5, mask_hi); // 1-complement clip
out = _mm_or_si128(out, A); // accumulate output
if (--k == 0) break;
A = _mm_slli_si128(A, 1); // rotate left sample
mask_hi = _mm_slli_si128(mask_hi, 1); // rotate mask
A = _mm_unpacklo_epi8(A, zero); // convert 8b->16b
}
A = _mm_srli_si128(A, 7); // prepare left sample for next iteration
_mm_storel_epi64((__m128i*)&row[i], out);
}
for (; i < length; ++i) {
const int delta = GradientPredictor_SSE2(row[i - 1], top[i], top[i - 1]);
row[i] = (uint8_t)(in[i] + delta);
}
}
}
static void GradientUnfilter_SSE2(const uint8_t* prev, const uint8_t* in,
uint8_t* out, int width) {
if (prev == NULL) {
HorizontalUnfilter_SSE2(NULL, in, out, width);
} else {
out[0] = (uint8_t)(in[0] + prev[0]); // predict from above
GradientPredictInverse_SSE2(in + 1, prev + 1, out + 1, width - 1);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8FiltersInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_SSE2;
WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_SSE2;
WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_SSE2;
WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_SSE2;
WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_SSE2;
WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8FiltersInitSSE2)
#endif // WEBP_USE_SSE2

View File

@ -0,0 +1,660 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transforms and color space conversion methods for lossless decoder.
//
// Authors: Vikas Arora (vikaas.arora@gmail.com)
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
#include "dsp_dsp.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include "dec_vp8li_dec.h"
#include "utils_endian_inl_utils.h"
#include "dsp_lossless.h"
#include "dsp_lossless_common.h"
//------------------------------------------------------------------------------
// Image transforms.
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
return Average2(Average2(a0, a2), a1);
}
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
return Average2(Average2(a0, a1), Average2(a2, a3));
}
static WEBP_INLINE uint32_t Clip255(uint32_t a) {
if (a < 256) {
return a;
}
// return 0, when a is a negative integer.
// return 255, when a is positive.
return ~a >> 24;
}
static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
return Clip255(a + b - c);
}
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
uint32_t c2) {
const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
(c1 >> 16) & 0xff,
(c2 >> 16) & 0xff);
const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
(c1 >> 8) & 0xff,
(c2 >> 8) & 0xff);
const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
return Clip255(a + (a - b) / 2);
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
uint32_t c2) {
const uint32_t ave = Average2(c0, c1);
const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
}
// gcc <= 4.9 on ARM generates incorrect code in Select() when Sub3() is
// inlined.
#if defined(__arm__) && defined(__GNUC__) && LOCAL_GCC_VERSION <= 0x409
# define LOCAL_INLINE __attribute__ ((noinline))
#else
# define LOCAL_INLINE WEBP_INLINE
#endif
static LOCAL_INLINE int Sub3(int a, int b, int c) {
const int pb = b - c;
const int pa = a - c;
return abs(pb) - abs(pa);
}
#undef LOCAL_INLINE
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
const int pa_minus_pb =
Sub3((a >> 24) , (b >> 24) , (c >> 24) ) +
Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
Sub3((a >> 8) & 0xff, (b >> 8) & 0xff, (c >> 8) & 0xff) +
Sub3((a ) & 0xff, (b ) & 0xff, (c ) & 0xff);
return (pa_minus_pb <= 0) ? a : b;
}
//------------------------------------------------------------------------------
// Predictors
uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top) {
(void)top;
(void)left;
return ARGB_BLACK;
}
uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top) {
(void)top;
return left;
}
uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[0];
}
uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[1];
}
uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top) {
(void)left;
return top[-1];
}
uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
return pred;
}
uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
return pred;
}
uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
return pred;
}
uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
return pred;
}
uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
static void PredictorAdd0_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int x;
(void)upper;
for (x = 0; x < num_pixels; ++x) out[x] = VP8LAddPixels(in[x], ARGB_BLACK);
}
static void PredictorAdd1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint32_t left = out[-1];
(void)upper;
for (i = 0; i < num_pixels; ++i) {
out[i] = left = VP8LAddPixels(in[i], left);
}
}
GENERATE_PREDICTOR_ADD(VP8LPredictor2_C, PredictorAdd2_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor3_C, PredictorAdd3_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor4_C, PredictorAdd4_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor5_C, PredictorAdd5_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor6_C, PredictorAdd6_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor7_C, PredictorAdd7_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor8_C, PredictorAdd8_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor9_C, PredictorAdd9_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor10_C, PredictorAdd10_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor11_C, PredictorAdd11_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor12_C, PredictorAdd12_C)
GENERATE_PREDICTOR_ADD(VP8LPredictor13_C, PredictorAdd13_C)
//------------------------------------------------------------------------------
// Inverse prediction.
static void PredictorInverseTransform_C(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* in, uint32_t* out) {
const int width = transform->xsize_;
if (y_start == 0) { // First Row follows the L (mode=1) mode.
PredictorAdd0_C(in, NULL, 1, out);
PredictorAdd1_C(in + 1, NULL, width - 1, out + 1);
in += width;
out += width;
++y_start;
}
{
int y = y_start;
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
const uint32_t* pred_mode_base =
transform->data_ + (y >> transform->bits_) * tiles_per_row;
while (y < y_end) {
const uint32_t* pred_mode_src = pred_mode_base;
int x = 1;
// First pixel follows the T (mode=2) mode.
PredictorAdd2_C(in, out - width, 1, out);
// .. the rest:
while (x < width) {
const VP8LPredictorAddSubFunc pred_func =
VP8LPredictorsAdd[((*pred_mode_src++) >> 8) & 0xf];
int x_end = (x & ~mask) + tile_width;
if (x_end > width) x_end = width;
pred_func(in + x, out + x - width, x_end - x, out + x);
x = x_end;
}
in += width;
out += width;
++y;
if ((y & mask) == 0) { // Use the same mask, since tiles are squares.
pred_mode_base += tiles_per_row;
}
}
}
}
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
uint32_t* dst) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = src[i];
const uint32_t green = ((argb >> 8) & 0xff);
uint32_t red_blue = (argb & 0x00ff00ffu);
red_blue += (green << 16) | green;
red_blue &= 0x00ff00ffu;
dst[i] = (argb & 0xff00ff00u) | red_blue;
}
}
static WEBP_INLINE int ColorTransformDelta(int8_t color_pred,
int8_t color) {
return ((int)color_pred * color) >> 5;
}
static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
VP8LMultipliers* const m) {
m->green_to_red_ = (color_code >> 0) & 0xff;
m->green_to_blue_ = (color_code >> 8) & 0xff;
m->red_to_blue_ = (color_code >> 16) & 0xff;
}
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = src[i];
const int8_t green = (int8_t)(argb >> 8);
const uint32_t red = argb >> 16;
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
new_blue += ColorTransformDelta(m->red_to_blue_, (int8_t)new_red);
new_blue &= 0xff;
dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
// Color space inverse transform.
static void ColorSpaceInverseTransform_C(const VP8LTransform* const transform,
int y_start, int y_end,
const uint32_t* src, uint32_t* dst) {
const int width = transform->xsize_;
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
const int safe_width = width & ~mask;
const int remaining_width = width - safe_width;
const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
int y = y_start;
const uint32_t* pred_row =
transform->data_ + (y >> transform->bits_) * tiles_per_row;
while (y < y_end) {
const uint32_t* pred = pred_row;
VP8LMultipliers m = { 0, 0, 0 };
const uint32_t* const src_safe_end = src + safe_width;
const uint32_t* const src_end = src + width;
while (src < src_safe_end) {
ColorCodeToMultipliers(*pred++, &m);
VP8LTransformColorInverse(&m, src, tile_width, dst);
src += tile_width;
dst += tile_width;
}
if (src < src_end) { // Left-overs using C-version.
ColorCodeToMultipliers(*pred++, &m);
VP8LTransformColorInverse(&m, src, remaining_width, dst);
src += remaining_width;
dst += remaining_width;
}
++y;
if ((y & mask) == 0) pred_row += tiles_per_row;
}
}
// Separate out pixels packed together using pixel-bundling.
// We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
#define COLOR_INDEX_INVERSE(FUNC_NAME, F_NAME, STATIC_DECL, TYPE, BIT_SUFFIX, \
GET_INDEX, GET_VALUE) \
static void F_NAME(const TYPE* src, const uint32_t* const color_map, \
TYPE* dst, int y_start, int y_end, int width) { \
int y; \
for (y = y_start; y < y_end; ++y) { \
int x; \
for (x = 0; x < width; ++x) { \
*dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
} \
} \
} \
STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \
int y_start, int y_end, const TYPE* src, \
TYPE* dst) { \
int y; \
const int bits_per_pixel = 8 >> transform->bits_; \
const int width = transform->xsize_; \
const uint32_t* const color_map = transform->data_; \
if (bits_per_pixel < 8) { \
const int pixels_per_byte = 1 << transform->bits_; \
const int count_mask = pixels_per_byte - 1; \
const uint32_t bit_mask = (1 << bits_per_pixel) - 1; \
for (y = y_start; y < y_end; ++y) { \
uint32_t packed_pixels = 0; \
int x; \
for (x = 0; x < width; ++x) { \
/* We need to load fresh 'packed_pixels' once every */ \
/* 'pixels_per_byte' increments of x. Fortunately, pixels_per_byte */ \
/* is a power of 2, so can just use a mask for that, instead of */ \
/* decrementing a counter. */ \
if ((x & count_mask) == 0) packed_pixels = GET_INDEX(*src++); \
*dst++ = GET_VALUE(color_map[packed_pixels & bit_mask]); \
packed_pixels >>= bits_per_pixel; \
} \
} \
} else { \
VP8LMapColor##BIT_SUFFIX(src, color_map, dst, y_start, y_end, width); \
} \
}
COLOR_INDEX_INVERSE(ColorIndexInverseTransform_C, MapARGB_C, static,
uint32_t, 32b, VP8GetARGBIndex, VP8GetARGBValue)
COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha_C, ,
uint8_t, 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef COLOR_INDEX_INVERSE
void VP8LInverseTransform(const VP8LTransform* const transform,
int row_start, int row_end,
const uint32_t* const in, uint32_t* const out) {
const int width = transform->xsize_;
assert(row_start < row_end);
assert(row_end <= transform->ysize_);
switch (transform->type_) {
case SUBTRACT_GREEN:
VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
break;
case PREDICTOR_TRANSFORM:
PredictorInverseTransform_C(transform, row_start, row_end, in, out);
if (row_end != transform->ysize_) {
// The last predicted row in this iteration will be the top-pred row
// for the first row in next iteration.
memcpy(out - width, out + (row_end - row_start - 1) * width,
width * sizeof(*out));
}
break;
case CROSS_COLOR_TRANSFORM:
ColorSpaceInverseTransform_C(transform, row_start, row_end, in, out);
break;
case COLOR_INDEXING_TRANSFORM:
if (in == out && transform->bits_ > 0) {
// Move packed pixels to the end of unpacked region, so that unpacking
// can occur seamlessly.
// Also, note that this is the only transform that applies on
// the effective width of VP8LSubSampleSize(xsize_, bits_). All other
// transforms work on effective width of xsize_.
const int out_stride = (row_end - row_start) * width;
const int in_stride = (row_end - row_start) *
VP8LSubSampleSize(transform->xsize_, transform->bits_);
uint32_t* const src = out + out_stride - in_stride;
memmove(src, out, in_stride * sizeof(*src));
ColorIndexInverseTransform_C(transform, row_start, row_end, src, out);
} else {
ColorIndexInverseTransform_C(transform, row_start, row_end, in, out);
}
break;
}
}
//------------------------------------------------------------------------------
// Color space conversion.
static int is_big_endian(void) {
static const union {
uint16_t w;
uint8_t b[2];
} tmp = { 1 };
return (tmp.b[0] != 1);
}
void VP8LConvertBGRAToRGB_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
*dst++ = (argb >> 16) & 0xff;
*dst++ = (argb >> 8) & 0xff;
*dst++ = (argb >> 0) & 0xff;
}
}
void VP8LConvertBGRAToRGBA_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
*dst++ = (argb >> 16) & 0xff;
*dst++ = (argb >> 8) & 0xff;
*dst++ = (argb >> 0) & 0xff;
*dst++ = (argb >> 24) & 0xff;
}
}
void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
const uint8_t rg = ((argb >> 16) & 0xf0) | ((argb >> 12) & 0xf);
const uint8_t ba = ((argb >> 0) & 0xf0) | ((argb >> 28) & 0xf);
#if (WEBP_SWAP_16BIT_CSP == 1)
*dst++ = ba;
*dst++ = rg;
#else
*dst++ = rg;
*dst++ = ba;
#endif
}
}
void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
const uint8_t rg = ((argb >> 16) & 0xf8) | ((argb >> 13) & 0x7);
const uint8_t gb = ((argb >> 5) & 0xe0) | ((argb >> 3) & 0x1f);
#if (WEBP_SWAP_16BIT_CSP == 1)
*dst++ = gb;
*dst++ = rg;
#else
*dst++ = rg;
*dst++ = gb;
#endif
}
}
void VP8LConvertBGRAToBGR_C(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
*dst++ = (argb >> 0) & 0xff;
*dst++ = (argb >> 8) & 0xff;
*dst++ = (argb >> 16) & 0xff;
}
}
static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
int swap_on_big_endian) {
if (is_big_endian() == swap_on_big_endian) {
const uint32_t* const src_end = src + num_pixels;
while (src < src_end) {
const uint32_t argb = *src++;
WebPUint32ToMem(dst, BSwap32(argb));
dst += sizeof(argb);
}
} else {
memcpy(dst, src, num_pixels * sizeof(*src));
}
}
void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
WEBP_CSP_MODE out_colorspace, uint8_t* const rgba) {
switch (out_colorspace) {
case MODE_RGB:
VP8LConvertBGRAToRGB(in_data, num_pixels, rgba);
break;
case MODE_RGBA:
VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
break;
case MODE_rgbA:
VP8LConvertBGRAToRGBA(in_data, num_pixels, rgba);
WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
break;
case MODE_BGR:
VP8LConvertBGRAToBGR(in_data, num_pixels, rgba);
break;
case MODE_BGRA:
CopyOrSwap(in_data, num_pixels, rgba, 1);
break;
case MODE_bgrA:
CopyOrSwap(in_data, num_pixels, rgba, 1);
WebPApplyAlphaMultiply(rgba, 0, num_pixels, 1, 0);
break;
case MODE_ARGB:
CopyOrSwap(in_data, num_pixels, rgba, 0);
break;
case MODE_Argb:
CopyOrSwap(in_data, num_pixels, rgba, 0);
WebPApplyAlphaMultiply(rgba, 1, num_pixels, 1, 0);
break;
case MODE_RGBA_4444:
VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
break;
case MODE_rgbA_4444:
VP8LConvertBGRAToRGBA4444(in_data, num_pixels, rgba);
WebPApplyAlphaMultiply4444(rgba, num_pixels, 1, 0);
break;
case MODE_RGB_565:
VP8LConvertBGRAToRGB565(in_data, num_pixels, rgba);
break;
default:
assert(0); // Code flow should not reach here.
}
}
//------------------------------------------------------------------------------
VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
VP8LPredictorFunc VP8LPredictors[16];
// exposed plain-C implementations
VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
VP8LTransformColorInverseFunc VP8LTransformColorInverse;
VP8LConvertFunc VP8LConvertBGRAToRGB;
VP8LConvertFunc VP8LConvertBGRAToRGBA;
VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
VP8LConvertFunc VP8LConvertBGRAToRGB565;
VP8LConvertFunc VP8LConvertBGRAToBGR;
VP8LMapARGBFunc VP8LMapColor32b;
VP8LMapAlphaFunc VP8LMapColor8b;
extern void VP8LDspInitSSE2(void);
extern void VP8LDspInitNEON(void);
extern void VP8LDspInitMIPSdspR2(void);
extern void VP8LDspInitMSA(void);
#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
(OUT)[0] = IN##0_C; \
(OUT)[1] = IN##1_C; \
(OUT)[2] = IN##2_C; \
(OUT)[3] = IN##3_C; \
(OUT)[4] = IN##4_C; \
(OUT)[5] = IN##5_C; \
(OUT)[6] = IN##6_C; \
(OUT)[7] = IN##7_C; \
(OUT)[8] = IN##8_C; \
(OUT)[9] = IN##9_C; \
(OUT)[10] = IN##10_C; \
(OUT)[11] = IN##11_C; \
(OUT)[12] = IN##12_C; \
(OUT)[13] = IN##13_C; \
(OUT)[14] = IN##0_C; /* <- padding security sentinels*/ \
(OUT)[15] = IN##0_C; \
} while (0);
WEBP_DSP_INIT_FUNC(VP8LDspInit) {
COPY_PREDICTOR_ARRAY(VP8LPredictor, VP8LPredictors)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
#if !WEBP_NEON_OMIT_C_CODE
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
VP8LTransformColorInverse = VP8LTransformColorInverse_C;
VP8LConvertBGRAToRGBA = VP8LConvertBGRAToRGBA_C;
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
#endif
VP8LConvertBGRAToRGBA4444 = VP8LConvertBGRAToRGBA4444_C;
VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
VP8LMapColor32b = MapARGB_C;
VP8LMapColor8b = MapAlpha_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LDspInitSSE2();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8LDspInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
VP8LDspInitMSA();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LDspInitNEON();
}
#endif
assert(VP8LAddGreenToBlueAndRed != NULL);
assert(VP8LTransformColorInverse != NULL);
assert(VP8LConvertBGRAToRGBA != NULL);
assert(VP8LConvertBGRAToRGB != NULL);
assert(VP8LConvertBGRAToBGR != NULL);
assert(VP8LConvertBGRAToRGBA4444 != NULL);
assert(VP8LConvertBGRAToRGB565 != NULL);
assert(VP8LMapColor32b != NULL);
assert(VP8LMapColor8b != NULL);
}
#undef COPY_PREDICTOR_ARRAY
//------------------------------------------------------------------------------

View File

@ -0,0 +1,244 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transforms and color space conversion methods for lossless decoder.
//
// Authors: Vikas Arora (vikaas.arora@gmail.com)
// Jyrki Alakuijala (jyrki@google.com)
#ifndef WEBP_DSP_LOSSLESS_H_
#define WEBP_DSP_LOSSLESS_H_
#include "webp_types.h"
#include "webp_decode.h"
#include "enc_histogram_enc.h"
#include "utils_utils.h"
#ifdef __cplusplus
extern "C" {
#endif
//------------------------------------------------------------------------------
// Decoding
typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
uint32_t VP8LPredictor0_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor1_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor2_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor3_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor4_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor5_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor6_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor7_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor8_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor9_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor10_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor11_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor12_C(uint32_t left, const uint32_t* const top);
uint32_t VP8LPredictor13_C(uint32_t left, const uint32_t* const top);
// These Add/Sub function expects upper[-1] and out[-1] to be readable.
typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
const uint32_t* upper, int num_pixels,
uint32_t* out);
extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
typedef void (*VP8LProcessDecBlueAndRedFunc)(const uint32_t* src,
int num_pixels, uint32_t* dst);
extern VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
typedef struct {
// Note: the members are uint8_t, so that any negative values are
// automatically converted to "mod 256" values.
uint8_t green_to_red_;
uint8_t green_to_blue_;
uint8_t red_to_blue_;
} VP8LMultipliers;
typedef void (*VP8LTransformColorInverseFunc)(const VP8LMultipliers* const m,
const uint32_t* src,
int num_pixels, uint32_t* dst);
extern VP8LTransformColorInverseFunc VP8LTransformColorInverse;
struct VP8LTransform; // Defined in dec/vp8li.h.
// Performs inverse transform of data given transform information, start and end
// rows. Transform will be applied to rows [row_start, row_end[.
// The *in and *out pointers refer to source and destination data respectively
// corresponding to the intermediate row (row_start).
void VP8LInverseTransform(const struct VP8LTransform* const transform,
int row_start, int row_end,
const uint32_t* const in, uint32_t* const out);
// Color space conversion.
typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
uint8_t* dst);
extern VP8LConvertFunc VP8LConvertBGRAToRGB;
extern VP8LConvertFunc VP8LConvertBGRAToRGBA;
extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
extern VP8LConvertFunc VP8LConvertBGRAToBGR;
// Converts from BGRA to other color spaces.
void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
typedef void (*VP8LMapARGBFunc)(const uint32_t* src,
const uint32_t* const color_map,
uint32_t* dst, int y_start,
int y_end, int width);
typedef void (*VP8LMapAlphaFunc)(const uint8_t* src,
const uint32_t* const color_map,
uint8_t* dst, int y_start,
int y_end, int width);
extern VP8LMapARGBFunc VP8LMapColor32b;
extern VP8LMapAlphaFunc VP8LMapColor8b;
// Similar to the static method ColorIndexInverseTransform() that is part of
// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
// uint32_t) arguments for 'src' and 'dst'.
void VP8LColorIndexInverseTransformAlpha(
const struct VP8LTransform* const transform, int y_start, int y_end,
const uint8_t* src, uint8_t* dst);
// Expose some C-only fallback functions
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst);
void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
uint32_t* dst);
// Must be called before calling any of the above methods.
void VP8LDspInit(void);
//------------------------------------------------------------------------------
// Encoding
typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* dst, int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColor;
typedef void (*VP8LCollectColorBlueTransformsFunc)(
const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue, int histo[]);
extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
typedef void (*VP8LCollectColorRedTransformsFunc)(
const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]);
extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
// Expose some C-only fallback functions
void VP8LTransformColor_C(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels);
void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]);
void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]);
extern VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
// -----------------------------------------------------------------------------
// Huffman-cost related functions.
typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
int length);
typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
const int Y[256]);
extern VP8LCostFunc VP8LExtraCost;
extern VP8LCostCombinedFunc VP8LExtraCostCombined;
extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
typedef struct { // small struct to hold counters
int counts[2]; // index: 0=zero streak, 1=non-zero streak
int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3]
} VP8LStreaks;
typedef struct { // small struct to hold bit entropy results
double entropy; // entropy
uint32_t sum; // sum of the population
int nonzeros; // number of non-zero elements in the population
uint32_t max_val; // maximum value in the population
uint32_t nonzero_code; // index of the last non-zero in the population
} VP8LBitEntropy;
void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
// Get the combined symbol bit entropy and Huffman cost stats for the
// distributions 'X' and 'Y'. Those results can then be refined according to
// codec specific heuristics.
typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)(
const uint32_t X[], const uint32_t Y[], int length,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats);
extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
// Get the entropy for the distribution 'X'.
typedef void (*VP8LGetEntropyUnrefinedFunc)(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats);
extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
VP8LBitEntropy* const entropy);
typedef void (*VP8LAddVectorFunc)(const uint32_t* a, const uint32_t* b,
uint32_t* out, int size);
extern VP8LAddVectorFunc VP8LAddVector;
typedef void (*VP8LAddVectorEqFunc)(const uint32_t* a, uint32_t* out, int size);
extern VP8LAddVectorEqFunc VP8LAddVectorEq;
void VP8LHistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out);
// -----------------------------------------------------------------------------
// PrefixEncode()
typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1,
const uint32_t* const array2, int length);
// Returns the first index where array1 and array2 are different.
extern VP8LVectorMismatchFunc VP8LVectorMismatch;
typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width,
int xbits, uint32_t* dst);
extern VP8LBundleColorMapFunc VP8LBundleColorMap;
void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
uint32_t* dst);
// Must be called before calling any of the above methods.
void VP8LEncDspInit(void);
//------------------------------------------------------------------------------
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_LOSSLESS_H_

View File

@ -0,0 +1,191 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transforms and color space conversion methods for lossless decoder.
//
// Authors: Vikas Arora (vikaas.arora@gmail.com)
// Jyrki Alakuijala (jyrki@google.com)
// Vincent Rabaud (vrabaud@google.com)
#ifndef WEBP_DSP_LOSSLESS_COMMON_H_
#define WEBP_DSP_LOSSLESS_COMMON_H_
#include "webp_types.h"
#include "utils_utils.h"
#ifdef __cplusplus
extern "C" {
#endif
//------------------------------------------------------------------------------
// Decoding
// color mapping related functions.
static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
return (idx >> 8) & 0xff;
}
static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
return idx;
}
static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
return val;
}
static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
return (val >> 8) & 0xff;
}
//------------------------------------------------------------------------------
// Misc methods.
// Computes sampled size of 'size' when sampling using 'sampling bits'.
static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
uint32_t sampling_bits) {
return (size + (1 << sampling_bits) - 1) >> sampling_bits;
}
// Converts near lossless quality into max number of bits shaved off.
static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) {
// 100 -> 0
// 80..99 -> 1
// 60..79 -> 2
// 40..59 -> 3
// 20..39 -> 4
// 0..19 -> 5
return 5 - near_lossless_quality / 20;
}
// -----------------------------------------------------------------------------
// Faster logarithm for integers. Small values use a look-up table.
// The threshold till approximate version of log_2 can be used.
// Practically, we can get rid of the call to log() as the two values match to
// very high degree (the ratio of these two is 0.99999x).
// Keeping a high threshold for now.
#define APPROX_LOG_WITH_CORRECTION_MAX 65536
#define APPROX_LOG_MAX 4096
#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
#define LOG_LOOKUP_IDX_MAX 256
extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
}
// Fast calculation of v * log2(v) for integer input.
static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
}
// -----------------------------------------------------------------------------
// PrefixEncode()
// Splitting of distance and length codes into prefixes and
// extra bits. The prefixes are encoded with an entropy code
// while the extra bits are stored just as normal bits.
static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
int* const extra_bits) {
const int highest_bit = BitsLog2Floor(--distance);
const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
*extra_bits = highest_bit - 1;
*code = 2 * highest_bit + second_highest_bit;
}
static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
int* const extra_bits,
int* const extra_bits_value) {
const int highest_bit = BitsLog2Floor(--distance);
const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
*extra_bits = highest_bit - 1;
*extra_bits_value = distance & ((1 << *extra_bits) - 1);
*code = 2 * highest_bit + second_highest_bit;
}
#define PREFIX_LOOKUP_IDX_MAX 512
typedef struct {
int8_t code_;
int8_t extra_bits_;
} VP8LPrefixCode;
// These tables are derived using VP8LPrefixEncodeNoLUT.
extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
int* const extra_bits) {
if (distance < PREFIX_LOOKUP_IDX_MAX) {
const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
*code = prefix_code.code_;
*extra_bits = prefix_code.extra_bits_;
} else {
VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
}
}
static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
int* const extra_bits,
int* const extra_bits_value) {
if (distance < PREFIX_LOOKUP_IDX_MAX) {
const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
*code = prefix_code.code_;
*extra_bits = prefix_code.extra_bits_;
*extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
} else {
VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
}
}
// Sum of each component, mod 256.
static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
uint32_t VP8LAddPixels(uint32_t a, uint32_t b) {
const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u);
const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}
// Difference of each component, mod 256.
static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
const uint32_t alpha_and_green =
0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
const uint32_t red_and_blue =
0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
}
//------------------------------------------------------------------------------
// Transform-related functions use din both encoding and decoding.
// Macros used to create a batch predictor that iteratively uses a
// one-pixel predictor.
// The predictor is added to the output pixel (which
// is therefore considered as a residual) to get the final prediction.
#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD) \
static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int x; \
assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x); \
out[x] = VP8LAddPixels(in[x], pred); \
} \
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_LOSSLESS_COMMON_H_

View File

@ -0,0 +1,929 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transform methods for lossless encoder.
//
// Authors: Vikas Arora (vikaas.arora@gmail.com)
// Jyrki Alakuijala (jyrki@google.com)
// Urvang Joshi (urvang@google.com)
#include "dsp_dsp.h"
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include "dec_vp8li_dec.h"
#include "utils_endian_inl_utils.h"
#include "dsp_lossless.h"
#include "dsp_lossless_common.h"
#include "dsp_yuv.h"
// lookup table for small values of log2(int)
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
0.0000000000000000f, 0.0000000000000000f,
1.0000000000000000f, 1.5849625007211560f,
2.0000000000000000f, 2.3219280948873621f,
2.5849625007211560f, 2.8073549220576041f,
3.0000000000000000f, 3.1699250014423121f,
3.3219280948873621f, 3.4594316186372973f,
3.5849625007211560f, 3.7004397181410921f,
3.8073549220576041f, 3.9068905956085187f,
4.0000000000000000f, 4.0874628412503390f,
4.1699250014423121f, 4.2479275134435852f,
4.3219280948873626f, 4.3923174227787606f,
4.4594316186372973f, 4.5235619560570130f,
4.5849625007211560f, 4.6438561897747243f,
4.7004397181410917f, 4.7548875021634682f,
4.8073549220576037f, 4.8579809951275718f,
4.9068905956085187f, 4.9541963103868749f,
5.0000000000000000f, 5.0443941193584533f,
5.0874628412503390f, 5.1292830169449663f,
5.1699250014423121f, 5.2094533656289501f,
5.2479275134435852f, 5.2854022188622487f,
5.3219280948873626f, 5.3575520046180837f,
5.3923174227787606f, 5.4262647547020979f,
5.4594316186372973f, 5.4918530963296747f,
5.5235619560570130f, 5.5545888516776376f,
5.5849625007211560f, 5.6147098441152083f,
5.6438561897747243f, 5.6724253419714951f,
5.7004397181410917f, 5.7279204545631987f,
5.7548875021634682f, 5.7813597135246599f,
5.8073549220576037f, 5.8328900141647412f,
5.8579809951275718f, 5.8826430493618415f,
5.9068905956085187f, 5.9307373375628866f,
5.9541963103868749f, 5.9772799234999167f,
6.0000000000000000f, 6.0223678130284543f,
6.0443941193584533f, 6.0660891904577720f,
6.0874628412503390f, 6.1085244567781691f,
6.1292830169449663f, 6.1497471195046822f,
6.1699250014423121f, 6.1898245588800175f,
6.2094533656289501f, 6.2288186904958804f,
6.2479275134435852f, 6.2667865406949010f,
6.2854022188622487f, 6.3037807481771030f,
6.3219280948873626f, 6.3398500028846243f,
6.3575520046180837f, 6.3750394313469245f,
6.3923174227787606f, 6.4093909361377017f,
6.4262647547020979f, 6.4429434958487279f,
6.4594316186372973f, 6.4757334309663976f,
6.4918530963296747f, 6.5077946401986963f,
6.5235619560570130f, 6.5391588111080309f,
6.5545888516776376f, 6.5698556083309478f,
6.5849625007211560f, 6.5999128421871278f,
6.6147098441152083f, 6.6293566200796094f,
6.6438561897747243f, 6.6582114827517946f,
6.6724253419714951f, 6.6865005271832185f,
6.7004397181410917f, 6.7142455176661224f,
6.7279204545631987f, 6.7414669864011464f,
6.7548875021634682f, 6.7681843247769259f,
6.7813597135246599f, 6.7944158663501061f,
6.8073549220576037f, 6.8201789624151878f,
6.8328900141647412f, 6.8454900509443747f,
6.8579809951275718f, 6.8703647195834047f,
6.8826430493618415f, 6.8948177633079437f,
6.9068905956085187f, 6.9188632372745946f,
6.9307373375628866f, 6.9425145053392398f,
6.9541963103868749f, 6.9657842846620869f,
6.9772799234999167f, 6.9886846867721654f,
7.0000000000000000f, 7.0112272554232539f,
7.0223678130284543f, 7.0334230015374501f,
7.0443941193584533f, 7.0552824355011898f,
7.0660891904577720f, 7.0768155970508308f,
7.0874628412503390f, 7.0980320829605263f,
7.1085244567781691f, 7.1189410727235076f,
7.1292830169449663f, 7.1395513523987936f,
7.1497471195046822f, 7.1598713367783890f,
7.1699250014423121f, 7.1799090900149344f,
7.1898245588800175f, 7.1996723448363644f,
7.2094533656289501f, 7.2191685204621611f,
7.2288186904958804f, 7.2384047393250785f,
7.2479275134435852f, 7.2573878426926521f,
7.2667865406949010f, 7.2761244052742375f,
7.2854022188622487f, 7.2946207488916270f,
7.3037807481771030f, 7.3128829552843557f,
7.3219280948873626f, 7.3309168781146167f,
7.3398500028846243f, 7.3487281542310771f,
7.3575520046180837f, 7.3663222142458160f,
7.3750394313469245f, 7.3837042924740519f,
7.3923174227787606f, 7.4008794362821843f,
7.4093909361377017f, 7.4178525148858982f,
7.4262647547020979f, 7.4346282276367245f,
7.4429434958487279f, 7.4512111118323289f,
7.4594316186372973f, 7.4676055500829976f,
7.4757334309663976f, 7.4838157772642563f,
7.4918530963296747f, 7.4998458870832056f,
7.5077946401986963f, 7.5156998382840427f,
7.5235619560570130f, 7.5313814605163118f,
7.5391588111080309f, 7.5468944598876364f,
7.5545888516776376f, 7.5622424242210728f,
7.5698556083309478f, 7.5774288280357486f,
7.5849625007211560f, 7.5924570372680806f,
7.5999128421871278f, 7.6073303137496104f,
7.6147098441152083f, 7.6220518194563764f,
7.6293566200796094f, 7.6366246205436487f,
7.6438561897747243f, 7.6510516911789281f,
7.6582114827517946f, 7.6653359171851764f,
7.6724253419714951f, 7.6794800995054464f,
7.6865005271832185f, 7.6934869574993252f,
7.7004397181410917f, 7.7073591320808825f,
7.7142455176661224f, 7.7210991887071855f,
7.7279204545631987f, 7.7347096202258383f,
7.7414669864011464f, 7.7481928495894605f,
7.7548875021634682f, 7.7615512324444795f,
7.7681843247769259f, 7.7747870596011736f,
7.7813597135246599f, 7.7879025593914317f,
7.7944158663501061f, 7.8008998999203047f,
7.8073549220576037f, 7.8137811912170374f,
7.8201789624151878f, 7.8265484872909150f,
7.8328900141647412f, 7.8392037880969436f,
7.8454900509443747f, 7.8517490414160571f,
7.8579809951275718f, 7.8641861446542797f,
7.8703647195834047f, 7.8765169465649993f,
7.8826430493618415f, 7.8887432488982591f,
7.8948177633079437f, 7.9008668079807486f,
7.9068905956085187f, 7.9128893362299619f,
7.9188632372745946f, 7.9248125036057812f,
7.9307373375628866f, 7.9366379390025709f,
7.9425145053392398f, 7.9483672315846778f,
7.9541963103868749f, 7.9600019320680805f,
7.9657842846620869f, 7.9715435539507719f,
7.9772799234999167f, 7.9829935746943103f,
7.9886846867721654f, 7.9943534368588577f
};
const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
0.00000000f, 0.00000000f, 2.00000000f, 4.75488750f,
8.00000000f, 11.60964047f, 15.50977500f, 19.65148445f,
24.00000000f, 28.52932501f, 33.21928095f, 38.05374781f,
43.01955001f, 48.10571634f, 53.30296891f, 58.60335893f,
64.00000000f, 69.48686830f, 75.05865003f, 80.71062276f,
86.43856190f, 92.23866588f, 98.10749561f, 104.04192499f,
110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
};
const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
{ 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
{ 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
{ 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
{ 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
{ 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
{10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
{10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
{11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
{11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
{12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
{12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
{12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
{12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
{13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
{13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
{13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
{13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
{14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
{15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
{16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
{17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
};
const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3,
0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
127,
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
};
static float FastSLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
uint32_t y = 1;
int correction = 0;
const float v_f = (float)v;
const uint32_t orig_v = v;
do {
++log_cnt;
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
// Xf = floor(Xf) * (1 + (v % y) / v)
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
// The correction factor: log(1 + d) ~ d; for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16
correction = (23 * (orig_v & (y - 1))) >> 4;
return v_f * (kLog2Table[v] + log_cnt) + correction;
} else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v));
}
}
static float FastLog2Slow_C(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
int log_cnt = 0;
uint32_t y = 1;
const uint32_t orig_v = v;
double log_2;
do {
++log_cnt;
v = v >> 1;
y = y << 1;
} while (v >= LOG_LOOKUP_IDX_MAX);
log_2 = kLog2Table[v] + log_cnt;
if (orig_v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
// for large values of 'v'.
const int correction = (23 * (orig_v & (y - 1))) >> 4;
log_2 += (double)correction / orig_v;
}
return (float)log_2;
} else {
return (float)(LOG_2_RECIPROCAL * log((double)v));
}
}
//------------------------------------------------------------------------------
// Methods to calculate Entropy (Shannon).
// Compute the combined Shanon's entropy for distribution {X} and {X+Y}
static float CombinedShannonEntropy_C(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX = 0, sumXY = 0;
for (i = 0; i < 256; ++i) {
const int x = X[i];
if (x != 0) {
const int xy = x + Y[i];
sumX += x;
retval -= VP8LFastSLog2(x);
sumXY += xy;
retval -= VP8LFastSLog2(xy);
} else if (Y[i] != 0) {
sumXY += Y[i];
retval -= VP8LFastSLog2(Y[i]);
}
}
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
return (float)retval;
}
void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
entropy->entropy = 0.;
entropy->sum = 0;
entropy->nonzeros = 0;
entropy->max_val = 0;
entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
}
void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
VP8LBitEntropy* const entropy) {
int i;
VP8LBitEntropyInit(entropy);
for (i = 0; i < n; ++i) {
if (array[i] != 0) {
entropy->sum += array[i];
entropy->nonzero_code = i;
++entropy->nonzeros;
entropy->entropy -= VP8LFastSLog2(array[i]);
if (entropy->max_val < array[i]) {
entropy->max_val = array[i];
}
}
}
entropy->entropy += VP8LFastSLog2(entropy->sum);
}
static WEBP_INLINE void GetEntropyUnrefinedHelper(
uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
const int streak = i - *i_prev;
// Gather info for the bit entropy.
if (*val_prev != 0) {
bit_entropy->sum += (*val_prev) * streak;
bit_entropy->nonzeros += streak;
bit_entropy->nonzero_code = *i_prev;
bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
if (bit_entropy->max_val < *val_prev) {
bit_entropy->max_val = *val_prev;
}
}
// Gather info for the Huffman cost.
stats->counts[*val_prev != 0] += (streak > 3);
stats->streaks[*val_prev != 0][(streak > 3)] += streak;
*val_prev = val;
*i_prev = i;
}
static void GetEntropyUnrefined_C(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(bit_entropy);
for (i = 1; i < length; ++i) {
const uint32_t x = X[i];
if (x != x_prev) {
GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
static void GetCombinedEntropyUnrefined_C(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(bit_entropy);
for (i = 1; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
if (xy != xy_prev) {
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
//------------------------------------------------------------------------------
void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const int argb = argb_data[i];
const int green = (argb >> 8) & 0xff;
const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
}
}
static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
return ((int)color_pred * color) >> 5;
}
static WEBP_INLINE int8_t U32ToS8(uint32_t v) {
return (int8_t)(v & 0xff);
}
void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
const uint32_t argb = data[i];
const int8_t green = U32ToS8(argb >> 8);
const int8_t red = U32ToS8(argb >> 16);
int new_red = red & 0xff;
int new_blue = argb & 0xff;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
new_blue -= ColorTransformDelta(m->red_to_blue_, red);
new_blue &= 0xff;
data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
uint32_t argb) {
const int8_t green = U32ToS8(argb >> 8);
int new_red = argb >> 16;
new_red -= ColorTransformDelta(green_to_red, green);
return (new_red & 0xff);
}
static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
uint8_t red_to_blue,
uint32_t argb) {
const int8_t green = U32ToS8(argb >> 8);
const int8_t red = U32ToS8(argb >> 16);
uint8_t new_blue = argb & 0xff;
new_blue -= ColorTransformDelta(green_to_blue, green);
new_blue -= ColorTransformDelta(red_to_blue, red);
return (new_blue & 0xff);
}
void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
while (tile_height-- > 0) {
int x;
for (x = 0; x < tile_width; ++x) {
++histo[TransformColorRed((uint8_t)green_to_red, argb[x])];
}
argb += stride;
}
}
void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
while (tile_height-- > 0) {
int x;
for (x = 0; x < tile_width; ++x) {
++histo[TransformColorBlue((uint8_t)green_to_blue, (uint8_t)red_to_blue,
argb[x])];
}
argb += stride;
}
}
//------------------------------------------------------------------------------
static int VectorMismatch_C(const uint32_t* const array1,
const uint32_t* const array2, int length) {
int match_len = 0;
while (match_len < length && array1[match_len] == array2[match_len]) {
++match_len;
}
return match_len;
}
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
uint32_t* dst) {
int x;
if (xbits > 0) {
const int bit_depth = 1 << (3 - xbits);
const int mask = (1 << xbits) - 1;
uint32_t code = 0xff000000;
for (x = 0; x < width; ++x) {
const int xsub = x & mask;
if (xsub == 0) {
code = 0xff000000;
}
code |= row[x] << (8 + bit_depth * xsub);
dst[x >> xbits] = code;
}
} else {
for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
}
}
//------------------------------------------------------------------------------
static double ExtraCost_C(const uint32_t* population, int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
return cost;
}
static double ExtraCostCombined_C(const uint32_t* X, const uint32_t* Y,
int length) {
int i;
double cost = 0.;
for (i = 2; i < length - 2; ++i) {
const int xy = X[i + 2] + Y[i + 2];
cost += (i >> 1) * xy;
}
return cost;
}
//------------------------------------------------------------------------------
static void AddVector_C(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
int i;
for (i = 0; i < size; ++i) out[i] = a[i] + b[i];
}
static void AddVectorEq_C(const uint32_t* a, uint32_t* out, int size) {
int i;
for (i = 0; i < size; ++i) out[i] += a[i];
}
#define ADD(X, ARG, LEN) do { \
if (a->is_used_[X]) { \
if (b->is_used_[X]) { \
VP8LAddVector(a->ARG, b->ARG, out->ARG, (LEN)); \
} else { \
memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \
} \
} else if (b->is_used_[X]) { \
memcpy(&out->ARG[0], &b->ARG[0], (LEN) * sizeof(out->ARG[0])); \
} else { \
memset(&out->ARG[0], 0, (LEN) * sizeof(out->ARG[0])); \
} \
} while (0)
#define ADD_EQ(X, ARG, LEN) do { \
if (a->is_used_[X]) { \
if (out->is_used_[X]) { \
VP8LAddVectorEq(a->ARG, out->ARG, (LEN)); \
} else { \
memcpy(&out->ARG[0], &a->ARG[0], (LEN) * sizeof(out->ARG[0])); \
} \
} \
} while (0)
void VP8LHistogramAdd(const VP8LHistogram* const a,
const VP8LHistogram* const b, VP8LHistogram* const out) {
int i;
const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
assert(a->palette_code_bits_ == b->palette_code_bits_);
if (b != out) {
ADD(0, literal_, literal_size);
ADD(1, red_, NUM_LITERAL_CODES);
ADD(2, blue_, NUM_LITERAL_CODES);
ADD(3, alpha_, NUM_LITERAL_CODES);
ADD(4, distance_, NUM_DISTANCE_CODES);
for (i = 0; i < 5; ++i) {
out->is_used_[i] = (a->is_used_[i] | b->is_used_[i]);
}
} else {
ADD_EQ(0, literal_, literal_size);
ADD_EQ(1, red_, NUM_LITERAL_CODES);
ADD_EQ(2, blue_, NUM_LITERAL_CODES);
ADD_EQ(3, alpha_, NUM_LITERAL_CODES);
ADD_EQ(4, distance_, NUM_DISTANCE_CODES);
for (i = 0; i < 5; ++i) out->is_used_[i] |= a->is_used_[i];
}
}
#undef ADD
#undef ADD_EQ
//------------------------------------------------------------------------------
// Image transforms.
static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
(void)upper;
}
static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
(void)upper;
}
// It subtracts the prediction from the input pixel and stores the residual
// in the output pixel.
#define GENERATE_PREDICTOR_SUB(PREDICTOR_I) \
static void PredictorSub##PREDICTOR_I##_C(const uint32_t* in, \
const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int x; \
assert(upper != NULL); \
for (x = 0; x < num_pixels; ++x) { \
const uint32_t pred = \
VP8LPredictor##PREDICTOR_I##_C(in[x - 1], upper + x); \
out[x] = VP8LSubPixels(in[x], pred); \
} \
}
GENERATE_PREDICTOR_SUB(2)
GENERATE_PREDICTOR_SUB(3)
GENERATE_PREDICTOR_SUB(4)
GENERATE_PREDICTOR_SUB(5)
GENERATE_PREDICTOR_SUB(6)
GENERATE_PREDICTOR_SUB(7)
GENERATE_PREDICTOR_SUB(8)
GENERATE_PREDICTOR_SUB(9)
GENERATE_PREDICTOR_SUB(10)
GENERATE_PREDICTOR_SUB(11)
GENERATE_PREDICTOR_SUB(12)
GENERATE_PREDICTOR_SUB(13)
//------------------------------------------------------------------------------
VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
VP8LTransformColorFunc VP8LTransformColor;
VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
VP8LFastLog2SlowFunc VP8LFastLog2Slow;
VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
VP8LCostFunc VP8LExtraCost;
VP8LCostCombinedFunc VP8LExtraCostCombined;
VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
VP8LAddVectorFunc VP8LAddVector;
VP8LAddVectorEqFunc VP8LAddVectorEq;
VP8LVectorMismatchFunc VP8LVectorMismatch;
VP8LBundleColorMapFunc VP8LBundleColorMap;
VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
extern void VP8LEncDspInitSSE2(void);
extern void VP8LEncDspInitSSE41(void);
extern void VP8LEncDspInitNEON(void);
extern void VP8LEncDspInitMIPS32(void);
extern void VP8LEncDspInitMIPSdspR2(void);
extern void VP8LEncDspInitMSA(void);
WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
VP8LDspInit();
#if !WEBP_NEON_OMIT_C_CODE
VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
VP8LTransformColor = VP8LTransformColor_C;
#endif
VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
VP8LFastLog2Slow = FastLog2Slow_C;
VP8LFastSLog2Slow = FastSLog2Slow_C;
VP8LExtraCost = ExtraCost_C;
VP8LExtraCostCombined = ExtraCostCombined_C;
VP8LCombinedShannonEntropy = CombinedShannonEntropy_C;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_C;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_C;
VP8LAddVector = AddVector_C;
VP8LAddVectorEq = AddVectorEq_C;
VP8LVectorMismatch = VectorMismatch_C;
VP8LBundleColorMap = VP8LBundleColorMap_C;
VP8LPredictorsSub[0] = PredictorSub0_C;
VP8LPredictorsSub[1] = PredictorSub1_C;
VP8LPredictorsSub[2] = PredictorSub2_C;
VP8LPredictorsSub[3] = PredictorSub3_C;
VP8LPredictorsSub[4] = PredictorSub4_C;
VP8LPredictorsSub[5] = PredictorSub5_C;
VP8LPredictorsSub[6] = PredictorSub6_C;
VP8LPredictorsSub[7] = PredictorSub7_C;
VP8LPredictorsSub[8] = PredictorSub8_C;
VP8LPredictorsSub[9] = PredictorSub9_C;
VP8LPredictorsSub[10] = PredictorSub10_C;
VP8LPredictorsSub[11] = PredictorSub11_C;
VP8LPredictorsSub[12] = PredictorSub12_C;
VP8LPredictorsSub[13] = PredictorSub13_C;
VP8LPredictorsSub[14] = PredictorSub0_C; // <- padding security sentinels
VP8LPredictorsSub[15] = PredictorSub0_C;
VP8LPredictorsSub_C[0] = PredictorSub0_C;
VP8LPredictorsSub_C[1] = PredictorSub1_C;
VP8LPredictorsSub_C[2] = PredictorSub2_C;
VP8LPredictorsSub_C[3] = PredictorSub3_C;
VP8LPredictorsSub_C[4] = PredictorSub4_C;
VP8LPredictorsSub_C[5] = PredictorSub5_C;
VP8LPredictorsSub_C[6] = PredictorSub6_C;
VP8LPredictorsSub_C[7] = PredictorSub7_C;
VP8LPredictorsSub_C[8] = PredictorSub8_C;
VP8LPredictorsSub_C[9] = PredictorSub9_C;
VP8LPredictorsSub_C[10] = PredictorSub10_C;
VP8LPredictorsSub_C[11] = PredictorSub11_C;
VP8LPredictorsSub_C[12] = PredictorSub12_C;
VP8LPredictorsSub_C[13] = PredictorSub13_C;
VP8LPredictorsSub_C[14] = PredictorSub0_C; // <- padding security sentinels
VP8LPredictorsSub_C[15] = PredictorSub0_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8LEncDspInitSSE2();
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
VP8LEncDspInitSSE41();
}
#endif
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8LEncDspInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8LEncDspInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
VP8LEncDspInitMSA();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
VP8LEncDspInitNEON();
}
#endif
assert(VP8LSubtractGreenFromBlueAndRed != NULL);
assert(VP8LTransformColor != NULL);
assert(VP8LCollectColorBlueTransforms != NULL);
assert(VP8LCollectColorRedTransforms != NULL);
assert(VP8LFastLog2Slow != NULL);
assert(VP8LFastSLog2Slow != NULL);
assert(VP8LExtraCost != NULL);
assert(VP8LExtraCostCombined != NULL);
assert(VP8LCombinedShannonEntropy != NULL);
assert(VP8LGetEntropyUnrefined != NULL);
assert(VP8LGetCombinedEntropyUnrefined != NULL);
assert(VP8LAddVector != NULL);
assert(VP8LAddVectorEq != NULL);
assert(VP8LVectorMismatch != NULL);
assert(VP8LBundleColorMap != NULL);
assert(VP8LPredictorsSub[0] != NULL);
assert(VP8LPredictorsSub[1] != NULL);
assert(VP8LPredictorsSub[2] != NULL);
assert(VP8LPredictorsSub[3] != NULL);
assert(VP8LPredictorsSub[4] != NULL);
assert(VP8LPredictorsSub[5] != NULL);
assert(VP8LPredictorsSub[6] != NULL);
assert(VP8LPredictorsSub[7] != NULL);
assert(VP8LPredictorsSub[8] != NULL);
assert(VP8LPredictorsSub[9] != NULL);
assert(VP8LPredictorsSub[10] != NULL);
assert(VP8LPredictorsSub[11] != NULL);
assert(VP8LPredictorsSub[12] != NULL);
assert(VP8LPredictorsSub[13] != NULL);
assert(VP8LPredictorsSub[14] != NULL);
assert(VP8LPredictorsSub[15] != NULL);
assert(VP8LPredictorsSub_C[0] != NULL);
assert(VP8LPredictorsSub_C[1] != NULL);
assert(VP8LPredictorsSub_C[2] != NULL);
assert(VP8LPredictorsSub_C[3] != NULL);
assert(VP8LPredictorsSub_C[4] != NULL);
assert(VP8LPredictorsSub_C[5] != NULL);
assert(VP8LPredictorsSub_C[6] != NULL);
assert(VP8LPredictorsSub_C[7] != NULL);
assert(VP8LPredictorsSub_C[8] != NULL);
assert(VP8LPredictorsSub_C[9] != NULL);
assert(VP8LPredictorsSub_C[10] != NULL);
assert(VP8LPredictorsSub_C[11] != NULL);
assert(VP8LPredictorsSub_C[12] != NULL);
assert(VP8LPredictorsSub_C[13] != NULL);
assert(VP8LPredictorsSub_C[14] != NULL);
assert(VP8LPredictorsSub_C[15] != NULL);
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,397 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "dsp_dsp.h"
#include "dsp_lossless.h"
#include "dsp_lossless_common.h"
#if defined(WEBP_USE_MIPS32)
#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>
static float FastSLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y, correction;
const int c24 = 24;
const float v_f = (float)v;
uint32_t temp;
// Xf = 256 = 2^8
// log_cnt is index of leading one in upper 24 bits
__asm__ volatile(
"clz %[log_cnt], %[v] \n\t"
"addiu %[y], $zero, 1 \n\t"
"subu %[log_cnt], %[c24], %[log_cnt] \n\t"
"sllv %[y], %[y], %[log_cnt] \n\t"
"srlv %[temp], %[v], %[log_cnt] \n\t"
: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
[temp]"=r"(temp)
: [c24]"r"(c24), [v]"r"(v)
);
// vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
// Xf = floor(Xf) * (1 + (v % y) / v)
// log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
// The correction factor: log(1 + d) ~ d; for very small d values, so
// log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
// LOG_2_RECIPROCAL ~ 23/16
// (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
correction = (23 * (v & (y - 1))) >> 4;
return v_f * (kLog2Table[temp] + log_cnt) + correction;
} else {
return (float)(LOG_2_RECIPROCAL * v * log((double)v));
}
}
static float FastLog2Slow_MIPS32(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
uint32_t log_cnt, y;
const int c24 = 24;
double log_2;
uint32_t temp;
__asm__ volatile(
"clz %[log_cnt], %[v] \n\t"
"addiu %[y], $zero, 1 \n\t"
"subu %[log_cnt], %[c24], %[log_cnt] \n\t"
"sllv %[y], %[y], %[log_cnt] \n\t"
"srlv %[temp], %[v], %[log_cnt] \n\t"
: [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
[temp]"=r"(temp)
: [c24]"r"(c24), [v]"r"(v)
);
log_2 = kLog2Table[temp] + log_cnt;
if (v >= APPROX_LOG_MAX) {
// Since the division is still expensive, add this correction factor only
// for large values of 'v'.
const uint32_t correction = (23 * (v & (y - 1))) >> 4;
log_2 += (double)correction / v;
}
return (float)log_2;
} else {
return (float)(LOG_2_RECIPROCAL * log((double)v));
}
}
// C version of this function:
// int i = 0;
// int64_t cost = 0;
// const uint32_t* pop = &population[4];
// const uint32_t* LoopEnd = &population[length];
// while (pop != LoopEnd) {
// ++i;
// cost += i * *pop;
// cost += i * *(pop + 1);
// pop += 2;
// }
// return (double)cost;
static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
int i, temp0, temp1;
const uint32_t* pop = &population[4];
const uint32_t* const LoopEnd = &population[length];
__asm__ volatile(
"mult $zero, $zero \n\t"
"xor %[i], %[i], %[i] \n\t"
"beq %[pop], %[LoopEnd], 2f \n\t"
"1: \n\t"
"lw %[temp0], 0(%[pop]) \n\t"
"lw %[temp1], 4(%[pop]) \n\t"
"addiu %[i], %[i], 1 \n\t"
"addiu %[pop], %[pop], 8 \n\t"
"madd %[i], %[temp0] \n\t"
"madd %[i], %[temp1] \n\t"
"bne %[pop], %[LoopEnd], 1b \n\t"
"2: \n\t"
"mfhi %[temp0] \n\t"
"mflo %[temp1] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[i]"=&r"(i), [pop]"+r"(pop)
: [LoopEnd]"r"(LoopEnd)
: "memory", "hi", "lo"
);
return (double)((int64_t)temp0 << 32 | temp1);
}
// C version of this function:
// int i = 0;
// int64_t cost = 0;
// const uint32_t* pX = &X[4];
// const uint32_t* pY = &Y[4];
// const uint32_t* LoopEnd = &X[length];
// while (pX != LoopEnd) {
// const uint32_t xy0 = *pX + *pY;
// const uint32_t xy1 = *(pX + 1) + *(pY + 1);
// ++i;
// cost += i * xy0;
// cost += i * xy1;
// pX += 2;
// pY += 2;
// }
// return (double)cost;
static double ExtraCostCombined_MIPS32(const uint32_t* const X,
const uint32_t* const Y, int length) {
int i, temp0, temp1, temp2, temp3;
const uint32_t* pX = &X[4];
const uint32_t* pY = &Y[4];
const uint32_t* const LoopEnd = &X[length];
__asm__ volatile(
"mult $zero, $zero \n\t"
"xor %[i], %[i], %[i] \n\t"
"beq %[pX], %[LoopEnd], 2f \n\t"
"1: \n\t"
"lw %[temp0], 0(%[pX]) \n\t"
"lw %[temp1], 0(%[pY]) \n\t"
"lw %[temp2], 4(%[pX]) \n\t"
"lw %[temp3], 4(%[pY]) \n\t"
"addiu %[i], %[i], 1 \n\t"
"addu %[temp0], %[temp0], %[temp1] \n\t"
"addu %[temp2], %[temp2], %[temp3] \n\t"
"addiu %[pX], %[pX], 8 \n\t"
"addiu %[pY], %[pY], 8 \n\t"
"madd %[i], %[temp0] \n\t"
"madd %[i], %[temp2] \n\t"
"bne %[pX], %[LoopEnd], 1b \n\t"
"2: \n\t"
"mfhi %[temp0] \n\t"
"mflo %[temp1] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
: [LoopEnd]"r"(LoopEnd)
: "memory", "hi", "lo"
);
return (double)((int64_t)temp0 << 32 | temp1);
}
#define HUFFMAN_COST_PASS \
__asm__ volatile( \
"sll %[temp1], %[temp0], 3 \n\t" \
"addiu %[temp3], %[streak], -3 \n\t" \
"addu %[temp2], %[pstreaks], %[temp1] \n\t" \
"blez %[temp3], 1f \n\t" \
"srl %[temp1], %[temp1], 1 \n\t" \
"addu %[temp3], %[pcnts], %[temp1] \n\t" \
"lw %[temp0], 4(%[temp2]) \n\t" \
"lw %[temp1], 0(%[temp3]) \n\t" \
"addu %[temp0], %[temp0], %[streak] \n\t" \
"addiu %[temp1], %[temp1], 1 \n\t" \
"sw %[temp0], 4(%[temp2]) \n\t" \
"sw %[temp1], 0(%[temp3]) \n\t" \
"b 2f \n\t" \
"1: \n\t" \
"lw %[temp0], 0(%[temp2]) \n\t" \
"addu %[temp0], %[temp0], %[streak] \n\t" \
"sw %[temp0], 0(%[temp2]) \n\t" \
"2: \n\t" \
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
: [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
[streak]"r"(streak) \
: "memory" \
);
// Returns the various RLE counts
static WEBP_INLINE void GetEntropyUnrefinedHelper(
uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
int* const pstreaks = &stats->streaks[0][0];
int* const pcnts = &stats->counts[0];
int temp0, temp1, temp2, temp3;
const int streak = i - *i_prev;
// Gather info for the bit entropy.
if (*val_prev != 0) {
bit_entropy->sum += (*val_prev) * streak;
bit_entropy->nonzeros += streak;
bit_entropy->nonzero_code = *i_prev;
bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
if (bit_entropy->max_val < *val_prev) {
bit_entropy->max_val = *val_prev;
}
}
// Gather info for the Huffman cost.
temp0 = (*val_prev != 0);
HUFFMAN_COST_PASS
*val_prev = val;
*i_prev = i;
}
static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length,
VP8LBitEntropy* const bit_entropy,
VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(bit_entropy);
for (i = 1; i < length; ++i) {
const uint32_t x = X[i];
if (x != x_prev) {
GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[],
const uint32_t Y[],
int length,
VP8LBitEntropy* const entropy,
VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
memset(stats, 0, sizeof(*stats));
VP8LBitEntropyInit(entropy);
for (i = 1; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
if (xy != xy_prev) {
GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats);
}
}
GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats);
entropy->entropy += VP8LFastSLog2(entropy->sum);
}
#define ASM_START \
__asm__ volatile( \
".set push \n\t" \
".set at \n\t" \
".set macro \n\t" \
"1: \n\t"
// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
// if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
"ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \
"ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \
"ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \
"ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \
"ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \
"ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \
"ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \
"ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \
"addu %[temp4], %[temp4], %[temp0] \n\t" \
"addu %[temp5], %[temp5], %[temp1] \n\t" \
"addu %[temp6], %[temp6], %[temp2] \n\t" \
"addu %[temp7], %[temp7], %[temp3] \n\t" \
"addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \
".if " #E " == 1 \n\t" \
"addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \
".endif \n\t" \
"usw %[temp4], " #A "(%[" #P2 "]) \n\t" \
"usw %[temp5], " #B "(%[" #P2 "]) \n\t" \
"usw %[temp6], " #C "(%[" #P2 "]) \n\t" \
"usw %[temp7], " #D "(%[" #P2 "]) \n\t" \
"addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \
"bne %[" #P0 "], %[LoopEnd], 1b \n\t" \
".set pop \n\t" \
#define ASM_END_COMMON_0 \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
[pa]"+r"(pa), [pout]"+r"(pout)
#define ASM_END_COMMON_1 \
: [LoopEnd]"r"(LoopEnd) \
: "memory", "at" \
);
#define ASM_END_0 \
ASM_END_COMMON_0 \
, [pb]"+r"(pb) \
ASM_END_COMMON_1
#define ASM_END_1 \
ASM_END_COMMON_0 \
ASM_END_COMMON_1
static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb,
uint32_t* pout, int size) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const uint32_t end = ((size) / 4) * 4;
const uint32_t* const LoopEnd = pa + end;
int i;
ASM_START
ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
ASM_END_0
for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i];
}
static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const uint32_t end = ((size) / 4) * 4;
const uint32_t* const LoopEnd = pa + end;
int i;
ASM_START
ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
ASM_END_1
for (i = end; i < size; ++i) pout[i] += pa[i];
}
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
VP8LFastSLog2Slow = FastSLog2Slow_MIPS32;
VP8LFastLog2Slow = FastLog2Slow_MIPS32;
VP8LExtraCost = ExtraCost_MIPS32;
VP8LExtraCostCombined = ExtraCostCombined_MIPS32;
VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32;
VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32;
VP8LAddVector = AddVector_MIPS32;
VP8LAddVectorEq = AddVectorEq_MIPS32;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)
#endif // WEBP_USE_MIPS32

View File

@ -0,0 +1,281 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transform methods for lossless encoder.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "dsp_lossless.h"
static void SubtractGreenFromBlueAndRed_MIPSdspR2(uint32_t* argb_data,
int num_pixels) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[argb_data], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[argb_data]) \n\t"
"lw %[temp1], 4(%[argb_data]) \n\t"
"lw %[temp2], 8(%[argb_data]) \n\t"
"lw %[temp3], 12(%[argb_data]) \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"ext %[temp5], %[temp1], 8, 8 \n\t"
"ext %[temp6], %[temp2], 8, 8 \n\t"
"ext %[temp7], %[temp3], 8, 8 \n\t"
"addiu %[argb_data], %[argb_data], 16 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"replv.ph %[temp5], %[temp5] \n\t"
"replv.ph %[temp6], %[temp6] \n\t"
"replv.ph %[temp7], %[temp7] \n\t"
"subu.qb %[temp0], %[temp0], %[temp4] \n\t"
"subu.qb %[temp1], %[temp1], %[temp5] \n\t"
"subu.qb %[temp2], %[temp2], %[temp6] \n\t"
"subu.qb %[temp3], %[temp3], %[temp7] \n\t"
"sw %[temp0], -16(%[argb_data]) \n\t"
"sw %[temp1], -12(%[argb_data]) \n\t"
"sw %[temp2], -8(%[argb_data]) \n\t"
"bne %[argb_data], %[p_loop1_end], 0b \n\t"
" sw %[temp3], -4(%[argb_data]) \n\t"
"3: \n\t"
"beq %[argb_data], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[argb_data]) \n\t"
"addiu %[argb_data], %[argb_data], 4 \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"subu.qb %[temp0], %[temp0], %[temp4] \n\t"
"bne %[argb_data], %[p_loop2_end], 1b \n\t"
" sw %[temp0], -4(%[argb_data]) \n\t"
"2: \n\t"
".set pop \n\t"
: [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
[temp7]"=&r"(temp7)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
int8_t color) {
return (uint32_t)((int)(color_pred) * color) >> 5;
}
static void TransformColor_MIPSdspR2(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red, new_red1;
const uint32_t G_to_R = m->green_to_red_;
const uint32_t G_to_B = m->green_to_blue_;
const uint32_t R_to_B = m->red_to_blue_;
uint32_t* const p_loop_end = data + (num_pixels & ~1);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[data], %[p_loop_end], 1f \n\t"
" nop \n\t"
"replv.ph %[temp0], %[G_to_R] \n\t"
"replv.ph %[temp1], %[G_to_B] \n\t"
"replv.ph %[temp2], %[R_to_B] \n\t"
"shll.ph %[temp0], %[temp0], 8 \n\t"
"shll.ph %[temp1], %[temp1], 8 \n\t"
"shll.ph %[temp2], %[temp2], 8 \n\t"
"shra.ph %[temp0], %[temp0], 8 \n\t"
"shra.ph %[temp1], %[temp1], 8 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"0: \n\t"
"lw %[argb], 0(%[data]) \n\t"
"lw %[argb1], 4(%[data]) \n\t"
"lhu %[new_red], 2(%[data]) \n\t"
"lhu %[new_red1], 6(%[data]) \n\t"
"precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
"precr.qb.ph %[temp4], %[argb], %[argb1] \n\t"
"preceu.ph.qbra %[temp3], %[temp3] \n\t"
"preceu.ph.qbla %[temp4], %[temp4] \n\t"
"shll.ph %[temp3], %[temp3], 8 \n\t"
"shll.ph %[temp4], %[temp4], 8 \n\t"
"shra.ph %[temp3], %[temp3], 8 \n\t"
"shra.ph %[temp4], %[temp4], 8 \n\t"
"mul.ph %[temp5], %[temp3], %[temp0] \n\t"
"mul.ph %[temp3], %[temp3], %[temp1] \n\t"
"mul.ph %[temp4], %[temp4], %[temp2] \n\t"
"addiu %[data], %[data], 8 \n\t"
"ins %[new_red1], %[new_red], 16, 16 \n\t"
"ins %[argb1], %[argb], 16, 16 \n\t"
"shra.ph %[temp5], %[temp5], 5 \n\t"
"shra.ph %[temp3], %[temp3], 5 \n\t"
"shra.ph %[temp4], %[temp4], 5 \n\t"
"subu.ph %[new_red1], %[new_red1], %[temp5] \n\t"
"subu.ph %[argb1], %[argb1], %[temp3] \n\t"
"preceu.ph.qbra %[temp5], %[new_red1] \n\t"
"subu.ph %[argb1], %[argb1], %[temp4] \n\t"
"preceu.ph.qbra %[temp3], %[argb1] \n\t"
"sb %[temp5], -2(%[data]) \n\t"
"sb %[temp3], -4(%[data]) \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
"sb %[temp5], -6(%[data]) \n\t"
"bne %[data], %[p_loop_end], 0b \n\t"
" sb %[temp3], -8(%[data]) \n\t"
"1: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
[argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
: [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
[G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
: "memory", "hi", "lo"
);
if (num_pixels & 1) {
const uint32_t argb_ = data[0];
const uint32_t green = argb_ >> 8;
const uint32_t red = argb_ >> 16;
uint32_t new_blue = argb_;
new_red = red;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
new_blue -= ColorTransformDelta(m->red_to_blue_, red);
new_blue &= 0xff;
data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
uint8_t red_to_blue,
uint32_t argb) {
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
uint8_t new_blue = argb;
new_blue -= ColorTransformDelta(green_to_blue, green);
new_blue -= ColorTransformDelta(red_to_blue, red);
return (new_blue & 0xff);
}
static void CollectColorBlueTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_blue,
int red_to_blue,
int histo[]) {
const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
const uint32_t mask = 0xff00ffu;
while (tile_height-- > 0) {
int x;
const uint32_t* p_argb = argb;
argb += stride;
for (x = 0; x < (tile_width >> 1); ++x) {
int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
__asm__ volatile (
"lw %[temp0], 0(%[p_argb]) \n\t"
"lw %[temp1], 4(%[p_argb]) \n\t"
"precr.qb.ph %[temp2], %[temp0], %[temp1] \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"shra.ph %[temp3], %[temp1], 8 \n\t"
"mul.ph %[temp5], %[temp2], %[rtb] \n\t"
"mul.ph %[temp6], %[temp3], %[gtb] \n\t"
"and %[temp4], %[temp1], %[mask] \n\t"
"addiu %[p_argb], %[p_argb], 8 \n\t"
"shra.ph %[temp5], %[temp5], 5 \n\t"
"shra.ph %[temp6], %[temp6], 5 \n\t"
"subu.qb %[temp2], %[temp4], %[temp5] \n\t"
"subu.qb %[temp2], %[temp2], %[temp6] \n\t"
: [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
: [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
: "memory", "hi", "lo"
);
++histo[(uint8_t)(temp2 >> 16)];
++histo[(uint8_t)temp2];
}
if (tile_width & 1) {
++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
}
}
}
static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
uint32_t argb) {
const uint32_t green = argb >> 8;
uint32_t new_red = argb >> 16;
new_red -= ColorTransformDelta(green_to_red, green);
return (new_red & 0xff);
}
static void CollectColorRedTransforms_MIPSdspR2(const uint32_t* argb,
int stride,
int tile_width,
int tile_height,
int green_to_red,
int histo[]) {
const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
while (tile_height-- > 0) {
int x;
const uint32_t* p_argb = argb;
argb += stride;
for (x = 0; x < (tile_width >> 1); ++x) {
int temp0, temp1, temp2, temp3, temp4;
__asm__ volatile (
"lw %[temp0], 0(%[p_argb]) \n\t"
"lw %[temp1], 4(%[p_argb]) \n\t"
"precrq.ph.w %[temp4], %[temp0], %[temp1] \n\t"
"ins %[temp1], %[temp0], 16, 16 \n\t"
"shra.ph %[temp3], %[temp1], 8 \n\t"
"mul.ph %[temp2], %[temp3], %[gtr] \n\t"
"addiu %[p_argb], %[p_argb], 8 \n\t"
"shra.ph %[temp2], %[temp2], 5 \n\t"
"subu.qb %[temp2], %[temp4], %[temp2] \n\t"
: [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
: [gtr]"r"(gtr)
: "memory", "hi", "lo"
);
++histo[(uint8_t)(temp2 >> 16)];
++histo[(uint8_t)temp2];
}
if (tile_width & 1) {
++histo[TransformColorRed(green_to_red, *p_argb)];
}
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MIPSdspR2;
VP8LTransformColor = TransformColor_MIPSdspR2;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_MIPSdspR2;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View File

@ -0,0 +1,148 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA variant of Image transform methods for lossless encoder.
//
// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MSA)
#include "dsp_lossless.h"
#include "dsp_msa_macro.h"
#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
v8i16 g0, g1, t0, t1, t2, t3; \
v4i32 t4, t5; \
VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
SRAI_H2_SH(t0, t1, 5); \
t0 = __msa_subv_h((v8i16)src0, t0); \
t1 = __msa_subv_h((v8i16)src1, t1); \
t4 = __msa_srli_w((v4i32)src0, 16); \
t5 = __msa_srli_w((v4i32)src1, 16); \
DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
SRAI_H2_SH(t2, t3, 5); \
SUB2(t0, t2, t1, t3, t0, t1); \
VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
} while (0)
#define TRANSFORM_COLOR_4(src, dst, c0, c1, mask0, mask1) do { \
const v16i8 g0 = VSHF_SB(src, src, mask0); \
v8i16 t0 = __msa_dotp_s_h(c0, g0); \
v8i16 t1; \
v4i32 t2; \
t0 = SRAI_H(t0, 5); \
t0 = __msa_subv_h((v8i16)src, t0); \
t2 = __msa_srli_w((v4i32)src, 16); \
t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
t1 = SRAI_H(t1, 5); \
t0 = t0 - t1; \
dst = VSHF_UB(src, t0, mask1); \
} while (0)
static void TransformColor_MSA(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
13, 255, 13, 255 };
const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
28, 13, 30, 15 };
while (num_pixels >= 8) {
v16u8 src1, dst1;
LD_UB2(data, 4, src0, src1);
TRANSFORM_COLOR_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
ST_UB2(dst0, dst1, data, 4);
data += 8;
num_pixels -= 8;
}
if (num_pixels > 0) {
if (num_pixels >= 4) {
src0 = LD_UB(data);
TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
ST_UB(dst0, data);
data += 4;
num_pixels -= 4;
}
if (num_pixels > 0) {
src0 = LD_UB(data);
TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
if (num_pixels == 3) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
SD(pix_d, data + 0);
SW(pix_w, data + 2);
} else if (num_pixels == 2) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
SD(pix_d, data);
} else {
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
SW(pix_w, data);
}
}
}
}
static void SubtractGreenFromBlueAndRed_MSA(uint32_t* argb_data,
int num_pixels) {
int i;
uint8_t* ptemp_data = (uint8_t*)argb_data;
v16u8 src0, dst0, tmp0;
const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
13, 255, 13, 255 };
while (num_pixels >= 8) {
v16u8 src1, dst1, tmp1;
LD_UB2(ptemp_data, 16, src0, src1);
VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
SUB2(src0, tmp0, src1, tmp1, dst0, dst1);
ST_UB2(dst0, dst1, ptemp_data, 16);
ptemp_data += 8 * 4;
num_pixels -= 8;
}
if (num_pixels > 0) {
if (num_pixels >= 4) {
src0 = LD_UB(ptemp_data);
tmp0 = VSHF_UB(src0, src0, mask);
dst0 = src0 - tmp0;
ST_UB(dst0, ptemp_data);
ptemp_data += 4 * 4;
num_pixels -= 4;
}
for (i = 0; i < num_pixels; i++) {
const uint8_t b = ptemp_data[0];
const uint8_t g = ptemp_data[1];
const uint8_t r = ptemp_data[2];
ptemp_data[0] = (b - g) & 0xff;
ptemp_data[2] = (r - g) & 0xff;
ptemp_data += 4;
}
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_MSA;
VP8LTransformColor = TransformColor_MSA;
}
#else // !WEBP_USE_MSA
WEBP_DSP_INIT_STUB(VP8LEncDspInitMSA)
#endif // WEBP_USE_MSA

View File

@ -0,0 +1,144 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "dsp_lossless.h"
#include "dsp_neon.h"
//------------------------------------------------------------------------------
// Subtract-Green Transform
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
defined(__apple_build_version__) && (__apple_build_version__< 6020037)
#define USE_VTBLQ
#endif
#ifdef USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
#else // !USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void SubtractGreenFromBlueAndRed_NEON(uint32_t* argb_data,
int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColor_NEON(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_)
};
const int16x8_t mults_rb = vld1q_s16(rb);
const int16_t b2[8] = {
0, CST(red_to_blue_), 0, CST(red_to_blue_),
0, CST(red_to_blue_), 0, CST(red_to_blue_),
};
const int16x8_t mults_b2 = vld1q_s16(b2);
#undef CST
#ifdef USE_VTBLQ
static const uint8_t kg0g0[16] = {
255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
};
const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
const uint32x4_t mask_rb = vdupq_n_u32(0x00ff00ffu); // red-blue masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// r 0 b 0
const int16x8_t B = vshlq_n_s16(vreinterpretq_s16_u8(in), 8);
// x db2 0 0
const int16x8_t C = vqdmulhq_s16(B, mults_b2);
// 0 0 x db2
const uint32x4_t D = vshrq_n_u32(vreinterpretq_u32_s16(C), 16);
// x dr x db
const int8x16_t E = vaddq_s8(vreinterpretq_s8_u32(D),
vreinterpretq_s8_s16(A));
// 0 dr 0 db
const uint32x4_t F = vandq_u32(vreinterpretq_u32_s8(E), mask_rb);
const int8x16_t out = vsubq_s8(vreinterpretq_s8_u8(in),
vreinterpretq_s8_u32(F));
vst1q_s8((int8_t*)(argb_data + i), out);
}
// fallthrough and finish off with plain-C
VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
#undef USE_VTBLQ
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_NEON;
VP8LTransformColor = TransformColor_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8LEncDspInitNEON)
#endif // WEBP_USE_NEON

View File

@ -0,0 +1,696 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "dsp_lossless.h"
#include "dsp_common_sse2.h"
#include "dsp_lossless_common.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
int num_pixels) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
const __m128i out = _mm_sub_epi8(in, C);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
if (i != num_pixels) {
VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
}
//------------------------------------------------------------------------------
// Color Transform
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
static void TransformColor_SSE2(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels) {
const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
CST_5b(m->green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0
const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0
const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2
const __m128i H = _mm_add_epi8(G, D); // x dr x db
const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db
const __m128i out = _mm_sub_epi8(in, I);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
if (i != num_pixels) {
VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
}
//------------------------------------------------------------------------------
#define SPAN 8
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0
const __m128i A1 = _mm_slli_epi16(in1, 8);
const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i B1 = _mm_and_si128(in1, mask_g);
const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0
const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db
const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b'
const __m128i E1 = _mm_sub_epi8(in1, D1);
const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db
const __m128i F1 = _mm_srli_epi32(C1, 16);
const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b'
const __m128i G1 = _mm_sub_epi8(E1, F1);
const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b
const __m128i H1 = _mm_and_si128(G1, mask_b);
const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b'
_mm_storeu_si128((__m128i*)values, I);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_blue, red_to_blue, histo);
}
}
}
static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi32(0xff);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i A1 = _mm_and_si128(in1, mask_g);
const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
const __m128i B1 = _mm_srli_epi32(in1, 16);
const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr
const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r'
const __m128i E1 = _mm_sub_epi8(B1, C1);
const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r'
const __m128i F1 = _mm_and_si128(E1, mask);
const __m128i I = _mm_packs_epi32(F0, F1);
_mm_storeu_si128((__m128i*)values, I);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_red, histo);
}
}
}
#undef SPAN
#undef MK_CST_16
//------------------------------------------------------------------------------
// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
// that's ok since the histogram values are less than 1<<28 (max picture size).
#define LINE_SIZE 16 // 8 or 16
static void AddVector_SSE2(const uint32_t* a, const uint32_t* b, uint32_t* out,
int size) {
int i;
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
}
for (; i < size; ++i) {
out[i] = a[i] + b[i];
}
}
static void AddVectorEq_SSE2(const uint32_t* a, uint32_t* out, int size) {
int i;
for (i = 0; i + LINE_SIZE <= size; i += LINE_SIZE) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
#if (LINE_SIZE == 16)
const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
#endif
const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
#if (LINE_SIZE == 16)
const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
#endif
_mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
_mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
#if (LINE_SIZE == 16)
_mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
_mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
#endif
}
for (; i < size; ++i) {
out[i] += a[i];
}
}
#undef LINE_SIZE
//------------------------------------------------------------------------------
// Entropy
// Checks whether the X or Y contribution is worth computing and adding.
// Used in loop unrolling.
#define ANALYZE_X_OR_Y(x_or_y, j) \
do { \
if ((x_or_y)[i + (j)] != 0) retval -= VP8LFastSLog2((x_or_y)[i + (j)]); \
} while (0)
// Checks whether the X + Y contribution is worth computing and adding.
// Used in loop unrolling.
#define ANALYZE_XY(j) \
do { \
if (tmp[j] != 0) { \
retval -= VP8LFastSLog2(tmp[j]); \
ANALYZE_X_OR_Y(X, j); \
} \
} while (0)
#if !(defined(__i386__) || defined(_M_IX86))
static float CombinedShannonEntropy_SSE2(const int X[256], const int Y[256]) {
int i;
double retval = 0.;
int sumX, sumXY;
int32_t tmp[4];
__m128i zero = _mm_setzero_si128();
// Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
__m128i sumXY_128 = zero;
__m128i sumX_128 = zero;
for (i = 0; i < 256; i += 4) {
const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
// Check if any X is non-zero: this actually provides a speedup as X is
// usually sparse.
if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
const __m128i xy_128 = _mm_add_epi32(x, y);
sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
sumX_128 = _mm_add_epi32(sumX_128, x);
// Analyze the different X + Y.
_mm_storeu_si128((__m128i*)tmp, xy_128);
ANALYZE_XY(0);
ANALYZE_XY(1);
ANALYZE_XY(2);
ANALYZE_XY(3);
} else {
// X is fully 0, so only deal with Y.
sumXY_128 = _mm_add_epi32(sumXY_128, y);
ANALYZE_X_OR_Y(Y, 0);
ANALYZE_X_OR_Y(Y, 1);
ANALYZE_X_OR_Y(Y, 2);
ANALYZE_X_OR_Y(Y, 3);
}
}
// Sum up sumX_128 to get sumX.
_mm_storeu_si128((__m128i*)tmp, sumX_128);
sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
// Sum up sumXY_128 to get sumXY.
_mm_storeu_si128((__m128i*)tmp, sumXY_128);
sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
return (float)retval;
}
#endif // !(defined(__i386__) || defined(_M_IX86))
#undef ANALYZE_X_OR_Y
#undef ANALYZE_XY
//------------------------------------------------------------------------------
static int VectorMismatch_SSE2(const uint32_t* const array1,
const uint32_t* const array2, int length) {
int match_len;
if (length >= 12) {
__m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
__m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
match_len = 0;
do {
// Loop unrolling and early load both provide a speedup of 10% for the
// current function. Also, max_limit can be MAX_LENGTH=4096 at most.
const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
const __m128i B0 =
_mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
const __m128i B1 =
_mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
if (_mm_movemask_epi8(cmpA) != 0xffff) break;
match_len += 4;
{
const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
if (_mm_movemask_epi8(cmpB) != 0xffff) break;
match_len += 4;
}
} while (match_len + 12 < length);
} else {
match_len = 0;
// Unroll the potential first two loops.
if (length >= 4 &&
_mm_movemask_epi8(_mm_cmpeq_epi32(
_mm_loadu_si128((const __m128i*)&array1[0]),
_mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
match_len = 4;
if (length >= 8 &&
_mm_movemask_epi8(_mm_cmpeq_epi32(
_mm_loadu_si128((const __m128i*)&array1[4]),
_mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
match_len = 8;
}
}
}
while (match_len < length && array1[match_len] == array2[match_len]) {
++match_len;
}
return match_len;
}
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
uint32_t* dst) {
int x;
assert(xbits >= 0);
assert(xbits <= 3);
switch (xbits) {
case 0: {
const __m128i ff = _mm_set1_epi16((short)0xff00);
const __m128i zero = _mm_setzero_si128();
// Store 0xff000000 | (row[x] << 8).
for (x = 0; x + 16 <= width; x += 16, dst += 16) {
const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
_mm_storeu_si128((__m128i*)&dst[0], dst0);
_mm_storeu_si128((__m128i*)&dst[4], dst1);
_mm_storeu_si128((__m128i*)&dst[8], dst2);
_mm_storeu_si128((__m128i*)&dst[12], dst3);
}
break;
}
case 1: {
const __m128i ff = _mm_set1_epi16((short)0xff00);
const __m128i mul = _mm_set1_epi16(0x110);
for (x = 0; x + 16 <= width; x += 16, dst += 8) {
// 0a0b | (where a/b are 4 bits).
const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0
const __m128i pack = _mm_and_si128(tmp, ff); // ab00
const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
_mm_storeu_si128((__m128i*)&dst[0], dst0);
_mm_storeu_si128((__m128i*)&dst[4], dst1);
}
break;
}
case 2: {
const __m128i mask_or = _mm_set1_epi32(0xff000000);
const __m128i mul_cst = _mm_set1_epi16(0x0104);
const __m128i mask_mul = _mm_set1_epi16(0x0f00);
for (x = 0; x + 16 <= width; x += 16, dst += 4) {
// 000a000b000c000d | (where a/b/c/d are 2 bits).
const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0
const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000
const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000
const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000
// Convert to 0xff00**00.
const __m128i res = _mm_or_si128(pack, mask_or);
_mm_storeu_si128((__m128i*)dst, res);
}
break;
}
default: {
assert(xbits == 3);
for (x = 0; x + 16 <= width; x += 16, dst += 2) {
// 0000000a00000000b... | (where a/b are 1 bit).
const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
const __m128i shift = _mm_slli_epi64(in, 7);
const uint32_t move = _mm_movemask_epi8(shift);
dst[0] = 0xff000000 | ((move & 0xff) << 8);
dst[1] = 0xff000000 | (move & 0xff00);
}
break;
}
}
if (x != width) {
VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
}
}
//------------------------------------------------------------------------------
// Batch version of Predictor Transform subtraction
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
const __m128i* const a1,
__m128i* const avg) {
// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
const __m128i ones = _mm_set1_epi8(1);
const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
*avg = _mm_sub_epi8(avg1, one);
}
// Predictor0: ARGB_BLACK.
static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i black = _mm_set1_epi32(ARGB_BLACK);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i res = _mm_sub_epi8(src, black);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[0](in + i, NULL, num_pixels - i, out + i);
}
(void)upper;
}
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorSub##X##_SSE2(const uint32_t* const in, \
const uint32_t* const upper, \
int num_pixels, uint32_t* const out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
const __m128i res = _mm_sub_epi8(src, pred); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
if (i != num_pixels) { \
VP8LPredictorsSub_C[(X)](in + i, WEBP_OFFSET_PTR(upper, i), \
num_pixels - i, out + i); \
} \
}
GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L
GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T
GENERATE_PREDICTOR_1(3, upper[i + 1]) // Predictor3: TR
GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL
#undef GENERATE_PREDICTOR_1
// Predictor5: avg2(avg2(L, TR), T)
static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i avg, pred, res;
Average2_m128i(&L, &TR, &avg);
Average2_m128i(&avg, &T, &pred);
res = _mm_sub_epi8(src, pred);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
}
}
#define GENERATE_PREDICTOR_2(X, A, B) \
static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
__m128i pred, res; \
Average2_m128i(&tA, &tB, &pred); \
res = _mm_sub_epi8(src, pred); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
if (i != num_pixels) { \
VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
} \
}
GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL)
GENERATE_PREDICTOR_2(7, in[i - 1], upper[i]) // Predictor7: avg(L, T)
GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i]) // Predictor8: avg(TL, T)
GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1]) // Predictor9: average(T, TR)
#undef GENERATE_PREDICTOR_2
// Predictor10: avg(avg(L,TL), avg(T, TR)).
static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
__m128i avgTTR, avgLTL, avg, res;
Average2_m128i(&T, &TR, &avgTTR);
Average2_m128i(&L, &TL, &avgLTL);
Average2_m128i(&avgTTR, &avgLTL, &avg);
res = _mm_sub_epi8(src, avg);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
}
}
// Predictor11: select.
static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
__m128i* const out) {
// We can unpack with any value on the upper 32 bits, provided it's the same
// on both operands (to that their sum of abs diff is zero). Here we use *A.
const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
*out = _mm_packs_epi32(s_lo, s_hi);
}
static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i pa, pb;
GetSumAbsDiff32_SSE2(&T, &TL, &pa); // pa = sum |T-TL|
GetSumAbsDiff32_SSE2(&L, &TL, &pb); // pb = sum |L-TL|
{
const __m128i mask = _mm_cmpgt_epi32(pb, pa);
const __m128i A = _mm_and_si128(mask, L);
const __m128i B = _mm_andnot_si128(mask, T);
const __m128i pred = _mm_or_si128(A, B); // pred = (L > T)? L : T
const __m128i res = _mm_sub_epi8(src, pred);
_mm_storeu_si128((__m128i*)&out[i], res);
}
}
if (i != num_pixels) {
VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
}
}
// Predictor12: ClampedSubSubtractFull.
static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);
const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
const __m128i res = _mm_sub_epi8(src, pred);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
}
}
// Predictors13: ClampedAddSubtractHalf
static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 2 <= num_pixels; i += 2) {
// we can only process two pixels at a time
const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
const __m128i sum = _mm_add_epi16(T_lo, L_lo);
const __m128i avg = _mm_srli_epi16(sum, 1);
const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
const __m128i A3 = _mm_srai_epi16(A2, 1);
const __m128i A4 = _mm_add_epi16(avg, A3);
const __m128i pred = _mm_packus_epi16(A4, A4);
const __m128i res = _mm_sub_epi8(src, pred);
_mm_storel_epi64((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE2;
VP8LTransformColor = TransformColor_SSE2;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE2;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE2;
VP8LAddVector = AddVector_SSE2;
VP8LAddVectorEq = AddVectorEq_SSE2;
// TODO(https://crbug.com/webp/499): this function produces different results
// from the C code due to use of double/float resulting in output differences
// when compared to -noasm.
#if !(defined(__i386__) || defined(_M_IX86))
VP8LCombinedShannonEntropy = CombinedShannonEntropy_SSE2;
#endif
VP8LVectorMismatch = VectorMismatch_SSE2;
VP8LBundleColorMap = BundleColorMap_SSE2;
VP8LPredictorsSub[0] = PredictorSub0_SSE2;
VP8LPredictorsSub[1] = PredictorSub1_SSE2;
VP8LPredictorsSub[2] = PredictorSub2_SSE2;
VP8LPredictorsSub[3] = PredictorSub3_SSE2;
VP8LPredictorsSub[4] = PredictorSub4_SSE2;
VP8LPredictorsSub[5] = PredictorSub5_SSE2;
VP8LPredictorsSub[6] = PredictorSub6_SSE2;
VP8LPredictorsSub[7] = PredictorSub7_SSE2;
VP8LPredictorsSub[8] = PredictorSub8_SSE2;
VP8LPredictorsSub[9] = PredictorSub9_SSE2;
VP8LPredictorsSub[10] = PredictorSub10_SSE2;
VP8LPredictorsSub[11] = PredictorSub11_SSE2;
VP8LPredictorsSub[12] = PredictorSub12_SSE2;
VP8LPredictorsSub[13] = PredictorSub13_SSE2;
VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels
VP8LPredictorsSub[15] = PredictorSub0_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
#endif // WEBP_USE_SSE2

View File

@ -0,0 +1,148 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE4.1 variant of methods for lossless encoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
#include "dsp_lossless.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)(X) << 8)) >> 5)
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
int num_pixels) {
int i;
const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
-1, 5, -1, 5, -1, 1, -1, 1);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
const __m128i out = _mm_sub_epi8(in, in_0g0g);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
if (i != num_pixels) {
VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}
}
//------------------------------------------------------------------------------
// Color Transform
#define SPAN 8
static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
int histo[]) {
const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
const __m128i mask_g = _mm_set1_epi16((short)0xff00); // green mask
const __m128i mask_gb = _mm_set1_epi32(0xffff); // green/blue mask
const __m128i mask_b = _mm_set1_epi16(0x00ff); // blue mask
const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
-1, -1, -1, -1, -1, -1, -1);
const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
2, -1, 6, -1, 10, -1, 14);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);
const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);
const __m128i r = _mm_or_si128(r0, r1); // r 0
const __m128i gb0 = _mm_and_si128(in0, mask_gb);
const __m128i gb1 = _mm_and_si128(in1, mask_gb);
const __m128i gb = _mm_packus_epi32(gb0, gb1); // g b
const __m128i g = _mm_and_si128(gb, mask_g); // g 0
const __m128i A = _mm_mulhi_epi16(r, mults_r); // x dbr
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dbg
const __m128i C = _mm_sub_epi8(gb, B); // x b'
const __m128i D = _mm_sub_epi8(C, A); // x b''
const __m128i E = _mm_and_si128(D, mask_b); // 0 b''
_mm_storeu_si128((__m128i*)values, E);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height,
green_to_blue, red_to_blue, histo);
}
}
}
static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
const __m128i mask = _mm_set1_epi16(0xff);
int y;
for (y = 0; y < tile_height; ++y) {
const uint32_t* const src = argb + y * stride;
int i, x;
for (x = 0; x + SPAN <= tile_width; x += SPAN) {
uint16_t values[SPAN];
const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
const __m128i g0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
const __m128i g1 = _mm_and_si128(in1, mask_g);
const __m128i g = _mm_packus_epi32(g0, g1); // g 0
const __m128i A0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
const __m128i A1 = _mm_srli_epi32(in1, 16);
const __m128i A = _mm_packus_epi32(A0, A1); // x r
const __m128i B = _mm_mulhi_epi16(g, mults_g); // x dr
const __m128i C = _mm_sub_epi8(A, B); // x r'
const __m128i D = _mm_and_si128(C, mask); // 0 r'
_mm_storeu_si128((__m128i*)values, D);
for (i = 0; i < SPAN; ++i) ++histo[values[i]];
}
}
{
const int left_over = tile_width & (SPAN - 1);
if (left_over > 0) {
VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
left_over, tile_height, green_to_red,
histo);
}
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LEncDspInitSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
}
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
#endif // WEBP_USE_SSE41

View File

@ -0,0 +1,696 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Image transforms and color space conversion methods for lossless decoder.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "dsp_lossless.h"
#include "dsp_lossless_common.h"
#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
static void FUNC_NAME(const TYPE* src, \
const uint32_t* const color_map, \
TYPE* dst, int y_start, int y_end, \
int width) { \
int y; \
for (y = y_start; y < y_end; ++y) { \
int x; \
for (x = 0; x < (width >> 2); ++x) { \
int tmp1, tmp2, tmp3, tmp4; \
__asm__ volatile ( \
".ifc " #TYPE ", uint8_t \n\t" \
"lbu %[tmp1], 0(%[src]) \n\t" \
"lbu %[tmp2], 1(%[src]) \n\t" \
"lbu %[tmp3], 2(%[src]) \n\t" \
"lbu %[tmp4], 3(%[src]) \n\t" \
"addiu %[src], %[src], 4 \n\t" \
".endif \n\t" \
".ifc " #TYPE ", uint32_t \n\t" \
"lw %[tmp1], 0(%[src]) \n\t" \
"lw %[tmp2], 4(%[src]) \n\t" \
"lw %[tmp3], 8(%[src]) \n\t" \
"lw %[tmp4], 12(%[src]) \n\t" \
"ext %[tmp1], %[tmp1], 8, 8 \n\t" \
"ext %[tmp2], %[tmp2], 8, 8 \n\t" \
"ext %[tmp3], %[tmp3], 8, 8 \n\t" \
"ext %[tmp4], %[tmp4], 8, 8 \n\t" \
"addiu %[src], %[src], 16 \n\t" \
".endif \n\t" \
"sll %[tmp1], %[tmp1], 2 \n\t" \
"sll %[tmp2], %[tmp2], 2 \n\t" \
"sll %[tmp3], %[tmp3], 2 \n\t" \
"sll %[tmp4], %[tmp4], 2 \n\t" \
"lwx %[tmp1], %[tmp1](%[color_map]) \n\t" \
"lwx %[tmp2], %[tmp2](%[color_map]) \n\t" \
"lwx %[tmp3], %[tmp3](%[color_map]) \n\t" \
"lwx %[tmp4], %[tmp4](%[color_map]) \n\t" \
".ifc " #TYPE ", uint8_t \n\t" \
"ext %[tmp1], %[tmp1], 8, 8 \n\t" \
"ext %[tmp2], %[tmp2], 8, 8 \n\t" \
"ext %[tmp3], %[tmp3], 8, 8 \n\t" \
"ext %[tmp4], %[tmp4], 8, 8 \n\t" \
"sb %[tmp1], 0(%[dst]) \n\t" \
"sb %[tmp2], 1(%[dst]) \n\t" \
"sb %[tmp3], 2(%[dst]) \n\t" \
"sb %[tmp4], 3(%[dst]) \n\t" \
"addiu %[dst], %[dst], 4 \n\t" \
".endif \n\t" \
".ifc " #TYPE ", uint32_t \n\t" \
"sw %[tmp1], 0(%[dst]) \n\t" \
"sw %[tmp2], 4(%[dst]) \n\t" \
"sw %[tmp3], 8(%[dst]) \n\t" \
"sw %[tmp4], 12(%[dst]) \n\t" \
"addiu %[dst], %[dst], 16 \n\t" \
".endif \n\t" \
: [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), \
[tmp4]"=&r"(tmp4), [src]"+&r"(src), [dst]"+r"(dst) \
: [color_map]"r"(color_map) \
: "memory" \
); \
} \
for (x = 0; x < (width & 3); ++x) { \
*dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
} \
} \
}
MAP_COLOR_FUNCS(MapARGB_MIPSdspR2, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
MAP_COLOR_FUNCS(MapAlpha_MIPSdspR2, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef MAP_COLOR_FUNCS
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
uint32_t c2) {
int temp0, temp1, temp2, temp3, temp4, temp5;
__asm__ volatile (
"preceu.ph.qbr %[temp1], %[c0] \n\t"
"preceu.ph.qbl %[temp2], %[c0] \n\t"
"preceu.ph.qbr %[temp3], %[c1] \n\t"
"preceu.ph.qbl %[temp4], %[c1] \n\t"
"preceu.ph.qbr %[temp5], %[c2] \n\t"
"preceu.ph.qbl %[temp0], %[c2] \n\t"
"subq.ph %[temp3], %[temp3], %[temp5] \n\t"
"subq.ph %[temp4], %[temp4], %[temp0] \n\t"
"addq.ph %[temp1], %[temp1], %[temp3] \n\t"
"addq.ph %[temp2], %[temp2], %[temp4] \n\t"
"shll_s.ph %[temp1], %[temp1], 7 \n\t"
"shll_s.ph %[temp2], %[temp2], 7 \n\t"
"precrqu_s.qb.ph %[temp2], %[temp2], %[temp1] \n\t"
: [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5)
: [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
: "memory"
);
return temp2;
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
uint32_t c2) {
int temp0, temp1, temp2, temp3, temp4, temp5;
__asm__ volatile (
"adduh.qb %[temp5], %[c0], %[c1] \n\t"
"preceu.ph.qbr %[temp3], %[c2] \n\t"
"preceu.ph.qbr %[temp1], %[temp5] \n\t"
"preceu.ph.qbl %[temp2], %[temp5] \n\t"
"preceu.ph.qbl %[temp4], %[c2] \n\t"
"subq.ph %[temp3], %[temp1], %[temp3] \n\t"
"subq.ph %[temp4], %[temp2], %[temp4] \n\t"
"shrl.ph %[temp5], %[temp3], 15 \n\t"
"shrl.ph %[temp0], %[temp4], 15 \n\t"
"addq.ph %[temp3], %[temp3], %[temp5] \n\t"
"addq.ph %[temp4], %[temp0], %[temp4] \n\t"
"shra.ph %[temp3], %[temp3], 1 \n\t"
"shra.ph %[temp4], %[temp4], 1 \n\t"
"addq.ph %[temp1], %[temp1], %[temp3] \n\t"
"addq.ph %[temp2], %[temp2], %[temp4] \n\t"
"shll_s.ph %[temp1], %[temp1], 7 \n\t"
"shll_s.ph %[temp2], %[temp2], 7 \n\t"
"precrqu_s.qb.ph %[temp1], %[temp2], %[temp1] \n\t"
: [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=r"(temp4), [temp5]"=&r"(temp5)
: [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
: "memory"
);
return temp1;
}
static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
int temp0, temp1, temp2, temp3, temp4, temp5;
__asm__ volatile (
"cmpgdu.lt.qb %[temp1], %[c], %[b] \n\t"
"pick.qb %[temp1], %[b], %[c] \n\t"
"pick.qb %[temp2], %[c], %[b] \n\t"
"cmpgdu.lt.qb %[temp4], %[c], %[a] \n\t"
"pick.qb %[temp4], %[a], %[c] \n\t"
"pick.qb %[temp5], %[c], %[a] \n\t"
"subu.qb %[temp3], %[temp1], %[temp2] \n\t"
"subu.qb %[temp0], %[temp4], %[temp5] \n\t"
"raddu.w.qb %[temp3], %[temp3] \n\t"
"raddu.w.qb %[temp0], %[temp0] \n\t"
"subu %[temp3], %[temp3], %[temp0] \n\t"
"slti %[temp0], %[temp3], 0x1 \n\t"
"movz %[a], %[b], %[temp0] \n\t"
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp0]"=&r"(temp0),
[a]"+&r"(a)
: [b]"r"(b), [c]"r"(c)
);
return a;
}
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
__asm__ volatile (
"adduh.qb %[a0], %[a0], %[a1] \n\t"
: [a0]"+r"(a0)
: [a1]"r"(a1)
);
return a0;
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
return Average2(Average2(a0, a2), a1);
}
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
return Average2(Average2(a0, a1), Average2(a2, a3));
}
static uint32_t Predictor5_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average3(left, top[0], top[1]);
}
static uint32_t Predictor6_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average2(left, top[-1]);
}
static uint32_t Predictor7_MIPSdspR2(uint32_t left, const uint32_t* const top) {
return Average2(left, top[0]);
}
static uint32_t Predictor8_MIPSdspR2(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[-1], top[0]);
}
static uint32_t Predictor9_MIPSdspR2(uint32_t left, const uint32_t* const top) {
(void)left;
return Average2(top[0], top[1]);
}
static uint32_t Predictor10_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return Average4(left, top[-1], top[0], top[1]);
}
static uint32_t Predictor11_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return Select(top[0], left, top[-1]);
}
static uint32_t Predictor12_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return ClampedAddSubtractFull(left, top[0], top[-1]);
}
static uint32_t Predictor13_MIPSdspR2(uint32_t left,
const uint32_t* const top) {
return ClampedAddSubtractHalf(left, top[0], top[-1]);
}
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
static void AddGreenToBlueAndRed_MIPSdspR2(const uint32_t* src, int num_pixels,
uint32_t* dst) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"ext %[temp5], %[temp1], 8, 8 \n\t"
"ext %[temp6], %[temp2], 8, 8 \n\t"
"ext %[temp7], %[temp3], 8, 8 \n\t"
"addiu %[src], %[src], 16 \n\t"
"addiu %[dst], %[dst], 16 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"replv.ph %[temp5], %[temp5] \n\t"
"replv.ph %[temp6], %[temp6] \n\t"
"replv.ph %[temp7], %[temp7] \n\t"
"addu.qb %[temp0], %[temp0], %[temp4] \n\t"
"addu.qb %[temp1], %[temp1], %[temp5] \n\t"
"addu.qb %[temp2], %[temp2], %[temp6] \n\t"
"addu.qb %[temp3], %[temp3], %[temp7] \n\t"
"sw %[temp0], -16(%[dst]) \n\t"
"sw %[temp1], -12(%[dst]) \n\t"
"sw %[temp2], -8(%[dst]) \n\t"
"bne %[src], %[p_loop1_end], 0b \n\t"
" sw %[temp3], -4(%[dst]) \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"addiu %[src], %[src], 4 \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"addu.qb %[temp0], %[temp0], %[temp4] \n\t"
"bne %[src], %[p_loop2_end], 1b \n\t"
" sw %[temp0], -4(%[dst]) \n\t"
"2: \n\t"
".set pop \n\t"
: [dst]"+&r"(dst), [src]"+&r"(src), [temp0]"=&r"(temp0),
[temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
[temp7]"=&r"(temp7)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void TransformColorInverse_MIPSdspR2(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red;
const uint32_t G_to_R = m->green_to_red_;
const uint32_t G_to_B = m->green_to_blue_;
const uint32_t R_to_B = m->red_to_blue_;
const uint32_t* const p_loop_end = src + (num_pixels & ~1);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop_end], 1f \n\t"
" nop \n\t"
"replv.ph %[temp0], %[G_to_R] \n\t"
"replv.ph %[temp1], %[G_to_B] \n\t"
"replv.ph %[temp2], %[R_to_B] \n\t"
"shll.ph %[temp0], %[temp0], 8 \n\t"
"shll.ph %[temp1], %[temp1], 8 \n\t"
"shll.ph %[temp2], %[temp2], 8 \n\t"
"shra.ph %[temp0], %[temp0], 8 \n\t"
"shra.ph %[temp1], %[temp1], 8 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"0: \n\t"
"lw %[argb], 0(%[src]) \n\t"
"lw %[argb1], 4(%[src]) \n\t"
"sw %[argb], 0(%[dst]) \n\t"
"sw %[argb1], 4(%[dst]) \n\t"
"addiu %[src], %[src], 8 \n\t"
"addiu %[dst], %[dst], 8 \n\t"
"precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
"preceu.ph.qbra %[temp3], %[temp3] \n\t"
"shll.ph %[temp3], %[temp3], 8 \n\t"
"shra.ph %[temp3], %[temp3], 8 \n\t"
"mul.ph %[temp5], %[temp3], %[temp0] \n\t"
"mul.ph %[temp3], %[temp3], %[temp1] \n\t"
"precrq.ph.w %[new_red], %[argb], %[argb1] \n\t"
"ins %[argb1], %[argb], 16, 16 \n\t"
"shra.ph %[temp5], %[temp5], 5 \n\t"
"shra.ph %[temp3], %[temp3], 5 \n\t"
"addu.ph %[new_red], %[new_red], %[temp5] \n\t"
"addu.ph %[argb1], %[argb1], %[temp3] \n\t"
"preceu.ph.qbra %[temp5], %[new_red] \n\t"
"shll.ph %[temp4], %[temp5], 8 \n\t"
"shra.ph %[temp4], %[temp4], 8 \n\t"
"mul.ph %[temp4], %[temp4], %[temp2] \n\t"
"sb %[temp5], -2(%[dst]) \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"shra.ph %[temp4], %[temp4], 5 \n\t"
"addu.ph %[argb1], %[argb1], %[temp4] \n\t"
"preceu.ph.qbra %[temp3], %[argb1] \n\t"
"sb %[temp5], -6(%[dst]) \n\t"
"sb %[temp3], -4(%[dst]) \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
"bne %[src], %[p_loop_end], 0b \n\t"
" sb %[temp3], -8(%[dst]) \n\t"
"1: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[new_red]"=&r"(new_red), [argb]"=&r"(argb),
[argb1]"=&r"(argb1), [dst]"+&r"(dst), [src]"+&r"(src)
: [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
[G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
: "memory", "hi", "lo"
);
// Fall-back to C-version for left-overs.
if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
}
static void ConvertBGRAToRGB_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"ins %[temp3], %[temp2], 24, 8 \n\t"
"sll %[temp2], %[temp2], 8 \n\t"
"rotr %[temp3], %[temp3], 16 \n\t"
"ins %[temp2], %[temp1], 0, 16 \n\t"
"sll %[temp1], %[temp1], 8 \n\t"
"wsbh %[temp3], %[temp3] \n\t"
"balign %[temp0], %[temp1], 1 \n\t"
"wsbh %[temp2], %[temp2] \n\t"
"wsbh %[temp0], %[temp0] \n\t"
"usw %[temp3], 8(%[dst]) \n\t"
"rotr %[temp0], %[temp0], 16 \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
"addiu %[src], %[src], 16 \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 12 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"addiu %[src], %[src], 4 \n\t"
"wsbh %[temp1], %[temp0] \n\t"
"addiu %[dst], %[dst], 3 \n\t"
"ush %[temp1], -2(%[dst]) \n\t"
"sra %[temp0], %[temp0], 16 \n\t"
"bne %[src], %[p_loop2_end], 1b \n\t"
" sb %[temp0], -3(%[dst]) \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void ConvertBGRAToRGBA_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"wsbh %[temp0], %[temp0] \n\t"
"wsbh %[temp1], %[temp1] \n\t"
"wsbh %[temp2], %[temp2] \n\t"
"wsbh %[temp3], %[temp3] \n\t"
"addiu %[src], %[src], 16 \n\t"
"balign %[temp0], %[temp0], 1 \n\t"
"balign %[temp1], %[temp1], 1 \n\t"
"balign %[temp2], %[temp2], 1 \n\t"
"balign %[temp3], %[temp3], 1 \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp1], 4(%[dst]) \n\t"
"usw %[temp2], 8(%[dst]) \n\t"
"usw %[temp3], 12(%[dst]) \n\t"
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 16 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"wsbh %[temp0], %[temp0] \n\t"
"addiu %[src], %[src], 4 \n\t"
"balign %[temp0], %[temp0], 1 \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"bne %[src], %[p_loop2_end], 1b \n\t"
" addiu %[dst], %[dst], 4 \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void ConvertBGRAToRGBA4444_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"ext %[temp4], %[temp0], 28, 4 \n\t"
"ext %[temp5], %[temp0], 12, 4 \n\t"
"ins %[temp0], %[temp4], 0, 4 \n\t"
"ext %[temp4], %[temp1], 28, 4 \n\t"
"ins %[temp0], %[temp5], 16, 4 \n\t"
"ext %[temp5], %[temp1], 12, 4 \n\t"
"ins %[temp1], %[temp4], 0, 4 \n\t"
"ext %[temp4], %[temp2], 28, 4 \n\t"
"ins %[temp1], %[temp5], 16, 4 \n\t"
"ext %[temp5], %[temp2], 12, 4 \n\t"
"ins %[temp2], %[temp4], 0, 4 \n\t"
"ext %[temp4], %[temp3], 28, 4 \n\t"
"ins %[temp2], %[temp5], 16, 4 \n\t"
"ext %[temp5], %[temp3], 12, 4 \n\t"
"ins %[temp3], %[temp4], 0, 4 \n\t"
"precr.qb.ph %[temp1], %[temp1], %[temp0] \n\t"
"ins %[temp3], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 16 \n\t"
"precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
#if (WEBP_SWAP_16BIT_CSP == 1)
"usw %[temp1], 0(%[dst]) \n\t"
"usw %[temp3], 4(%[dst]) \n\t"
#else
"wsbh %[temp1], %[temp1] \n\t"
"wsbh %[temp3], %[temp3] \n\t"
"usw %[temp1], 0(%[dst]) \n\t"
"usw %[temp3], 4(%[dst]) \n\t"
#endif
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 8 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"ext %[temp4], %[temp0], 28, 4 \n\t"
"ext %[temp5], %[temp0], 12, 4 \n\t"
"ins %[temp0], %[temp4], 0, 4 \n\t"
"ins %[temp0], %[temp5], 16, 4 \n\t"
"addiu %[src], %[src], 4 \n\t"
"precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
#if (WEBP_SWAP_16BIT_CSP == 1)
"ush %[temp0], 0(%[dst]) \n\t"
#else
"wsbh %[temp0], %[temp0] \n\t"
"ush %[temp0], 0(%[dst]) \n\t"
#endif
"bne %[src], %[p_loop2_end], 1b \n\t"
" addiu %[dst], %[dst], 2 \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void ConvertBGRAToRGB565_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"ext %[temp4], %[temp0], 8, 16 \n\t"
"ext %[temp5], %[temp0], 5, 11 \n\t"
"ext %[temp0], %[temp0], 3, 5 \n\t"
"ins %[temp4], %[temp5], 0, 11 \n\t"
"ext %[temp5], %[temp1], 5, 11 \n\t"
"ins %[temp4], %[temp0], 0, 5 \n\t"
"ext %[temp0], %[temp1], 8, 16 \n\t"
"ext %[temp1], %[temp1], 3, 5 \n\t"
"ins %[temp0], %[temp5], 0, 11 \n\t"
"ext %[temp5], %[temp2], 5, 11 \n\t"
"ins %[temp0], %[temp1], 0, 5 \n\t"
"ext %[temp1], %[temp2], 8, 16 \n\t"
"ext %[temp2], %[temp2], 3, 5 \n\t"
"ins %[temp1], %[temp5], 0, 11 \n\t"
"ext %[temp5], %[temp3], 5, 11 \n\t"
"ins %[temp1], %[temp2], 0, 5 \n\t"
"ext %[temp2], %[temp3], 8, 16 \n\t"
"ext %[temp3], %[temp3], 3, 5 \n\t"
"ins %[temp2], %[temp5], 0, 11 \n\t"
"append %[temp0], %[temp4], 16 \n\t"
"ins %[temp2], %[temp3], 0, 5 \n\t"
"addiu %[src], %[src], 16 \n\t"
"append %[temp2], %[temp1], 16 \n\t"
#if (WEBP_SWAP_16BIT_CSP == 1)
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
#else
"wsbh %[temp0], %[temp0] \n\t"
"wsbh %[temp2], %[temp2] \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp2], 4(%[dst]) \n\t"
#endif
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 8 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"ext %[temp4], %[temp0], 8, 16 \n\t"
"ext %[temp5], %[temp0], 5, 11 \n\t"
"ext %[temp0], %[temp0], 3, 5 \n\t"
"ins %[temp4], %[temp5], 0, 11 \n\t"
"addiu %[src], %[src], 4 \n\t"
"ins %[temp4], %[temp0], 0, 5 \n\t"
#if (WEBP_SWAP_16BIT_CSP == 1)
"ush %[temp4], 0(%[dst]) \n\t"
#else
"wsbh %[temp4], %[temp4] \n\t"
"ush %[temp4], 0(%[dst]) \n\t"
#endif
"bne %[src], %[p_loop2_end], 1b \n\t"
" addiu %[dst], %[dst], 2 \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void ConvertBGRAToBGR_MIPSdspR2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int temp0, temp1, temp2, temp3;
const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
"beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"lw %[temp1], 4(%[src]) \n\t"
"lw %[temp2], 8(%[src]) \n\t"
"lw %[temp3], 12(%[src]) \n\t"
"ins %[temp0], %[temp1], 24, 8 \n\t"
"sra %[temp1], %[temp1], 8 \n\t"
"ins %[temp1], %[temp2], 16, 16 \n\t"
"sll %[temp2], %[temp2], 8 \n\t"
"balign %[temp3], %[temp2], 1 \n\t"
"addiu %[src], %[src], 16 \n\t"
"usw %[temp0], 0(%[dst]) \n\t"
"usw %[temp1], 4(%[dst]) \n\t"
"usw %[temp3], 8(%[dst]) \n\t"
"bne %[src], %[p_loop1_end], 0b \n\t"
" addiu %[dst], %[dst], 12 \n\t"
"3: \n\t"
"beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
"lw %[temp0], 0(%[src]) \n\t"
"addiu %[src], %[src], 4 \n\t"
"addiu %[dst], %[dst], 3 \n\t"
"ush %[temp0], -3(%[dst]) \n\t"
"sra %[temp0], %[temp0], 16 \n\t"
"bne %[src], %[p_loop2_end], 1b \n\t"
" sb %[temp0], -1(%[dst]) \n\t"
"2: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
VP8LMapColor32b = MapARGB_MIPSdspR2;
VP8LMapColor8b = MapAlpha_MIPSdspR2;
VP8LPredictors[5] = Predictor5_MIPSdspR2;
VP8LPredictors[6] = Predictor6_MIPSdspR2;
VP8LPredictors[7] = Predictor7_MIPSdspR2;
VP8LPredictors[8] = Predictor8_MIPSdspR2;
VP8LPredictors[9] = Predictor9_MIPSdspR2;
VP8LPredictors[10] = Predictor10_MIPSdspR2;
VP8LPredictors[11] = Predictor11_MIPSdspR2;
VP8LPredictors[12] = Predictor12_MIPSdspR2;
VP8LPredictors[13] = Predictor13_MIPSdspR2;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MIPSdspR2;
VP8LTransformColorInverse = TransformColorInverse_MIPSdspR2;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MIPSdspR2;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MIPSdspR2;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_MIPSdspR2;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_MIPSdspR2;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(VP8LDspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View File

@ -0,0 +1,356 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA variant of methods for lossless decoder
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MSA)
#include "dsp_lossless.h"
#include "dsp_msa_macro.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
#define CONVERT16_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \
v16u8 src0, src1, src2, src3, dst0, dst1, dst2; \
LD_UB4(psrc, 16, src0, src1, src2, src3); \
VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
dst2 = VSHF_UB(src2, src3, m2); \
ST_UB2(dst0, dst1, pdst, 16); \
ST_UB(dst2, pdst + 32); \
} while (0)
#define CONVERT12_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \
uint32_t pix_w; \
v16u8 src0, src1, src2, dst0, dst1, dst2; \
LD_UB3(psrc, 16, src0, src1, src2); \
VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
dst2 = VSHF_UB(src2, src2, m2); \
ST_UB2(dst0, dst1, pdst, 16); \
pix_w = __msa_copy_s_w((v4i32)dst2, 0); \
SW(pix_w, pdst + 32); \
} while (0)
#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \
uint64_t pix_d; \
v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \
LD_UB2(psrc, 16, src0, src1); \
VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
ST_UB(dst0, pdst); \
pix_d = __msa_copy_s_d((v2i64)dst1, 0); \
SD(pix_d, pdst + 16); \
} while (0)
#define CONVERT4_BGRA_XXX(psrc, pdst, m) do { \
const v16u8 src0 = LD_UB(psrc); \
const v16u8 dst0 = VSHF_UB(src0, src0, m); \
uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); \
uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2); \
SD(pix_d, pdst + 0); \
SW(pix_w, pdst + 8); \
} while (0)
#define CONVERT1_BGRA_BGR(psrc, pdst) do { \
const int32_t b = (psrc)[0]; \
const int32_t g = (psrc)[1]; \
const int32_t r = (psrc)[2]; \
(pdst)[0] = b; \
(pdst)[1] = g; \
(pdst)[2] = r; \
} while (0)
#define CONVERT1_BGRA_RGB(psrc, pdst) do { \
const int32_t b = (psrc)[0]; \
const int32_t g = (psrc)[1]; \
const int32_t r = (psrc)[2]; \
(pdst)[0] = r; \
(pdst)[1] = g; \
(pdst)[2] = b; \
} while (0)
#define TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, \
c0, c1, mask0, mask1) do { \
v8i16 g0, g1, t0, t1, t2, t3; \
v4i32 t4, t5; \
VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
SRAI_H2_SH(t0, t1, 5); \
t0 = __msa_addv_h(t0, (v8i16)src0); \
t1 = __msa_addv_h(t1, (v8i16)src1); \
t4 = __msa_srli_w((v4i32)t0, 16); \
t5 = __msa_srli_w((v4i32)t1, 16); \
DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
SRAI_H2_SH(t2, t3, 5); \
ADD2(t0, t2, t1, t3, t0, t1); \
VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
} while (0)
#define TRANSFORM_COLOR_INVERSE_4(src, dst, c0, c1, mask0, mask1) do { \
const v16i8 g0 = VSHF_SB(src, src, mask0); \
v8i16 t0 = __msa_dotp_s_h(c0, g0); \
v8i16 t1; \
v4i32 t2; \
t0 = SRAI_H(t0, 5); \
t0 = __msa_addv_h(t0, (v8i16)src); \
t2 = __msa_srli_w((v4i32)t0, 16); \
t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
t1 = SRAI_H(t1, 5); \
t0 = t0 + t1; \
dst = VSHF_UB(src, t0, mask1); \
} while (0)
static void ConvertBGRAToRGBA_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
int i;
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
v16u8 src0, dst0;
const v16u8 mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
while (num_pixels >= 8) {
v16u8 src1, dst1;
LD_UB2(ptemp_src, 16, src0, src1);
VSHF_B2_UB(src0, src0, src1, src1, mask, mask, dst0, dst1);
ST_UB2(dst0, dst1, ptemp_dst, 16);
ptemp_src += 32;
ptemp_dst += 32;
num_pixels -= 8;
}
if (num_pixels > 0) {
if (num_pixels >= 4) {
src0 = LD_UB(ptemp_src);
dst0 = VSHF_UB(src0, src0, mask);
ST_UB(dst0, ptemp_dst);
ptemp_src += 16;
ptemp_dst += 16;
num_pixels -= 4;
}
for (i = 0; i < num_pixels; i++) {
const uint8_t b = ptemp_src[2];
const uint8_t g = ptemp_src[1];
const uint8_t r = ptemp_src[0];
const uint8_t a = ptemp_src[3];
ptemp_dst[0] = b;
ptemp_dst[1] = g;
ptemp_dst[2] = r;
ptemp_dst[3] = a;
ptemp_src += 4;
ptemp_dst += 4;
}
}
}
static void ConvertBGRAToBGR_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
16, 17, 18, 20 };
const v16u8 mask1 = { 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20,
21, 22, 24, 25 };
const v16u8 mask2 = { 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25,
26, 28, 29, 30 };
while (num_pixels >= 16) {
CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
ptemp_src += 64;
ptemp_dst += 48;
num_pixels -= 16;
}
if (num_pixels > 0) {
if (num_pixels >= 12) {
CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
ptemp_src += 48;
ptemp_dst += 36;
num_pixels -= 12;
} else if (num_pixels >= 8) {
CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
ptemp_src += 32;
ptemp_dst += 24;
num_pixels -= 8;
} else if (num_pixels >= 4) {
CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
ptemp_src += 16;
ptemp_dst += 12;
num_pixels -= 4;
}
if (num_pixels == 3) {
CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
CONVERT1_BGRA_BGR(ptemp_src + 8, ptemp_dst + 6);
} else if (num_pixels == 2) {
CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
} else if (num_pixels == 1) {
CONVERT1_BGRA_BGR(ptemp_src, ptemp_dst);
}
}
}
static void ConvertBGRAToRGB_MSA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint8_t* ptemp_src = (const uint8_t*)src;
uint8_t* ptemp_dst = (uint8_t*)dst;
const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
18, 17, 16, 22 };
const v16u8 mask1 = { 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22,
21, 20, 26, 25 };
const v16u8 mask2 = { 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25,
24, 30, 29, 28 };
while (num_pixels >= 16) {
CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
ptemp_src += 64;
ptemp_dst += 48;
num_pixels -= 16;
}
if (num_pixels) {
if (num_pixels >= 12) {
CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
ptemp_src += 48;
ptemp_dst += 36;
num_pixels -= 12;
} else if (num_pixels >= 8) {
CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
ptemp_src += 32;
ptemp_dst += 24;
num_pixels -= 8;
} else if (num_pixels >= 4) {
CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
ptemp_src += 16;
ptemp_dst += 12;
num_pixels -= 4;
}
if (num_pixels == 3) {
CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
CONVERT1_BGRA_RGB(ptemp_src + 8, ptemp_dst + 6);
} else if (num_pixels == 2) {
CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
} else if (num_pixels == 1) {
CONVERT1_BGRA_RGB(ptemp_src, ptemp_dst);
}
}
}
static void AddGreenToBlueAndRed_MSA(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
int i;
const uint8_t* in = (const uint8_t*)src;
uint8_t* out = (uint8_t*)dst;
v16u8 src0, dst0, tmp0;
const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
13, 255, 13, 255 };
while (num_pixels >= 8) {
v16u8 src1, dst1, tmp1;
LD_UB2(in, 16, src0, src1);
VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
ADD2(src0, tmp0, src1, tmp1, dst0, dst1);
ST_UB2(dst0, dst1, out, 16);
in += 32;
out += 32;
num_pixels -= 8;
}
if (num_pixels > 0) {
if (num_pixels >= 4) {
src0 = LD_UB(in);
tmp0 = VSHF_UB(src0, src0, mask);
dst0 = src0 + tmp0;
ST_UB(dst0, out);
in += 16;
out += 16;
num_pixels -= 4;
}
for (i = 0; i < num_pixels; i++) {
const uint8_t b = in[0];
const uint8_t g = in[1];
const uint8_t r = in[2];
out[0] = (b + g) & 0xff;
out[1] = g;
out[2] = (r + g) & 0xff;
out[4] = in[4];
out += 4;
}
}
}
static void TransformColorInverse_MSA(const VP8LMultipliers* const m,
const uint32_t* src, int num_pixels,
uint32_t* dst) {
v16u8 src0, dst0;
const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
(m->green_to_red_ << 16));
const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
13, 255, 13, 255 };
const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
28, 13, 30, 15 };
while (num_pixels >= 8) {
v16u8 src1, dst1;
LD_UB2(src, 4, src0, src1);
TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
ST_UB2(dst0, dst1, dst, 4);
src += 8;
dst += 8;
num_pixels -= 8;
}
if (num_pixels > 0) {
if (num_pixels >= 4) {
src0 = LD_UB(src);
TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
ST_UB(dst0, dst);
src += 4;
dst += 4;
num_pixels -= 4;
}
if (num_pixels > 0) {
src0 = LD_UB(src);
TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
if (num_pixels == 3) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
SD(pix_d, dst + 0);
SW(pix_w, dst + 2);
} else if (num_pixels == 2) {
const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
SD(pix_d, dst);
} else {
const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
SW(pix_w, dst);
}
}
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_MSA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_MSA;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_MSA;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_MSA;
VP8LTransformColorInverse = TransformColorInverse_MSA;
}
#else // !WEBP_USE_MSA
WEBP_DSP_INIT_STUB(VP8LDspInitMSA)
#endif // WEBP_USE_MSA

View File

@ -0,0 +1,641 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_NEON)
#include <arm_neon.h>
#include "dsp_lossless.h"
#include "dsp_neon.h"
//------------------------------------------------------------------------------
// Colorspace conversion functions
#if !defined(WORK_AROUND_GCC)
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
// gcc-4.8.x at least.
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
// swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
const uint8x16_t tmp = pixel.val[0];
pixel.val[0] = pixel.val[2];
pixel.val[2] = tmp;
vst4q_u8(dst, pixel);
dst += 64;
}
VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
const uint8x16x3_t tmp = { { pixel.val[0], pixel.val[1], pixel.val[2] } };
vst3q_u8(dst, tmp);
dst += 48;
}
VP8LConvertBGRAToBGR_C(src, num_pixels & 15, dst); // left-overs
}
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~15);
for (; src < end; src += 16) {
const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
vst3q_u8(dst, tmp);
dst += 48;
}
VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst); // left-overs
}
#else // WORK_AROUND_GCC
// gcc-4.6.0 fallback
static const uint8_t kRGBAShuffle[8] = { 2, 1, 0, 3, 6, 5, 4, 7 };
static void ConvertBGRAToRGBA_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~1);
const uint8x8_t shuffle = vld1_u8(kRGBAShuffle);
for (; src < end; src += 2) {
const uint8x8_t pixels = vld1_u8((uint8_t*)src);
vst1_u8(dst, vtbl1_u8(pixels, shuffle));
dst += 8;
}
VP8LConvertBGRAToRGBA_C(src, num_pixels & 1, dst); // left-overs
}
static const uint8_t kBGRShuffle[3][8] = {
{ 0, 1, 2, 4, 5, 6, 8, 9 },
{ 10, 12, 13, 14, 16, 17, 18, 20 },
{ 21, 22, 24, 25, 26, 28, 29, 30 }
};
static void ConvertBGRAToBGR_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kBGRShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kBGRShuffle[1]);
const uint8x8_t shuffle2 = vld1_u8(kBGRShuffle[2]);
for (; src < end; src += 8) {
uint8x8x4_t pixels;
INIT_VECTOR4(pixels,
vld1_u8((const uint8_t*)(src + 0)),
vld1_u8((const uint8_t*)(src + 2)),
vld1_u8((const uint8_t*)(src + 4)),
vld1_u8((const uint8_t*)(src + 6)));
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
dst += 8 * 3;
}
VP8LConvertBGRAToBGR_C(src, num_pixels & 7, dst); // left-overs
}
static const uint8_t kRGBShuffle[3][8] = {
{ 2, 1, 0, 6, 5, 4, 10, 9 },
{ 8, 14, 13, 12, 18, 17, 16, 22 },
{ 21, 20, 26, 25, 24, 30, 29, 28 }
};
static void ConvertBGRAToRGB_NEON(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const uint32_t* const end = src + (num_pixels & ~7);
const uint8x8_t shuffle0 = vld1_u8(kRGBShuffle[0]);
const uint8x8_t shuffle1 = vld1_u8(kRGBShuffle[1]);
const uint8x8_t shuffle2 = vld1_u8(kRGBShuffle[2]);
for (; src < end; src += 8) {
uint8x8x4_t pixels;
INIT_VECTOR4(pixels,
vld1_u8((const uint8_t*)(src + 0)),
vld1_u8((const uint8_t*)(src + 2)),
vld1_u8((const uint8_t*)(src + 4)),
vld1_u8((const uint8_t*)(src + 6)));
vst1_u8(dst + 0, vtbl4_u8(pixels, shuffle0));
vst1_u8(dst + 8, vtbl4_u8(pixels, shuffle1));
vst1_u8(dst + 16, vtbl4_u8(pixels, shuffle2));
dst += 8 * 3;
}
VP8LConvertBGRAToRGB_C(src, num_pixels & 7, dst); // left-overs
}
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Predictor Transform
#define LOAD_U32_AS_U8(IN) vreinterpret_u8_u32(vdup_n_u32((IN)))
#define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN)))
#define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN)))
#define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN)))
#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0);
#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0);
#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN)));
#define ROTATE32_LEFT(L) vextq_u8((L), (L), 12) // D|C|B|A -> C|B|A|D
static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) {
const uint8x8_t A0 = LOAD_U32_AS_U8(a0);
const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
return vhadd_u8(A0, A1);
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_NEON(uint32_t c0,
uint32_t c1,
uint32_t c2) {
const uint8x8_t avg = Average2_u8_NEON(c0, c1);
// Remove one to c2 when bigger than avg.
const uint8x8_t C2 = LOAD_U32_AS_U8(c2);
const uint8x8_t cmp = vcgt_u8(C2, avg);
const uint8x8_t C2_1 = vadd_u8(C2, cmp);
// Compute half of the difference between avg and c2.
const int8x8_t diff_avg = vreinterpret_s8_u8(vhsub_u8(avg, C2_1));
// Compute the sum with avg and saturate.
const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(avg));
const uint8x8_t res = vqmovun_s16(vaddw_s8(avg_16, diff_avg));
const uint32_t output = GET_U8_AS_U32(res);
return output;
}
static WEBP_INLINE uint32_t Average2_NEON(uint32_t a0, uint32_t a1) {
const uint8x8_t avg_u8x8 = Average2_u8_NEON(a0, a1);
const uint32_t avg = GET_U8_AS_U32(avg_u8x8);
return avg;
}
static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
uint32_t a2) {
const uint8x8_t avg0 = Average2_u8_NEON(a0, a2);
const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
const uint32_t avg = GET_U8_AS_U32(vhadd_u8(avg0, A1));
return avg;
}
static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
return Average3_NEON(left, top[0], top[1]);
}
static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
return Average2_NEON(left, top[-1]);
}
static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
return Average2_NEON(left, top[0]);
}
static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
}
// Batch versions of those functions.
// Predictor0: ARGB_BLACK.
static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
const uint8x16_t res = vaddq_u8(src, black);
STOREQ_U8_AS_U32P(&out[i], res);
}
VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
}
// Predictor1: left.
static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const uint8x16_t zero = LOADQ_U32_AS_U8(0);
for (i = 0; i + 4 <= num_pixels; i += 4) {
// a | b | c | d
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
// 0 | a | b | c
const uint8x16_t shift0 = vextq_u8(zero, src, 12);
// a | a + b | b + c | c + d
const uint8x16_t sum0 = vaddq_u8(src, shift0);
// 0 | 0 | a | a + b
const uint8x16_t shift1 = vextq_u8(zero, sum0, 8);
// a | a + b | a + b + c | a + b + c + d
const uint8x16_t sum1 = vaddq_u8(sum0, shift1);
const uint8x16_t prev = LOADQ_U32_AS_U8(out[i - 1]);
const uint8x16_t res = vaddq_u8(sum1, prev);
STOREQ_U8_AS_U32P(&out[i], res);
}
VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
}
// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorAdd##X##_NEON(const uint32_t* in, \
const uint32_t* upper, int num_pixels, \
uint32_t* out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
const uint8x16_t other = LOADQ_U32P_AS_U8(&(IN)); \
const uint8x16_t res = vaddq_u8(src, other); \
STOREQ_U8_AS_U32P(&out[i], res); \
} \
VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
}
// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1
// Predictor5: average(average(left, TR), T)
#define DO_PRED5(LANE) do { \
const uint8x16_t avgLTR = vhaddq_u8(L, TR); \
const uint8x16_t avg = vhaddq_u8(avgLTR, T); \
const uint8x16_t res = vaddq_u8(avg, src); \
vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
L = ROTATE32_LEFT(res); \
} while (0)
static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i + 0]);
const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
DO_PRED5(0);
DO_PRED5(1);
DO_PRED5(2);
DO_PRED5(3);
}
VP8LPredictorsAdd_C[5](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED5
#define DO_PRED67(LANE) do { \
const uint8x16_t avg = vhaddq_u8(L, top); \
const uint8x16_t res = vaddq_u8(avg, src); \
vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
L = ROTATE32_LEFT(res); \
} while (0)
// Predictor6: average(left, TL)
static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i - 1]);
DO_PRED67(0);
DO_PRED67(1);
DO_PRED67(2);
DO_PRED67(3);
}
VP8LPredictorsAdd_C[6](in + i, upper + i, num_pixels - i, out + i);
}
// Predictor7: average(left, T)
static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i]);
DO_PRED67(0);
DO_PRED67(1);
DO_PRED67(2);
DO_PRED67(3);
}
VP8LPredictorsAdd_C[7](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED67
#define GENERATE_PREDICTOR_2(X, IN) \
static void PredictorAdd##X##_NEON(const uint32_t* in, \
const uint32_t* upper, int num_pixels, \
uint32_t* out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
const uint8x16_t Tother = LOADQ_U32P_AS_U8(&(IN)); \
const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); \
const uint8x16_t avg = vhaddq_u8(T, Tother); \
const uint8x16_t res = vaddq_u8(avg, src); \
STOREQ_U8_AS_U32P(&out[i], res); \
} \
VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
}
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2
// Predictor10: average of (average of (L,TL), average of (T, TR)).
#define DO_PRED10(LANE) do { \
const uint8x16_t avgLTL = vhaddq_u8(L, TL); \
const uint8x16_t avg = vhaddq_u8(avgTTR, avgLTL); \
const uint8x16_t res = vaddq_u8(avg, src); \
vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
L = ROTATE32_LEFT(res); \
} while (0)
static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
const uint8x16_t avgTTR = vhaddq_u8(T, TR);
DO_PRED10(0);
DO_PRED10(1);
DO_PRED10(2);
DO_PRED10(3);
}
VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED10
// Predictor11: select.
#define DO_PRED11(LANE) do { \
const uint8x16_t sumLin = vaddq_u8(L, src); /* in + L */ \
const uint8x16_t pLTL = vabdq_u8(L, TL); /* |L - TL| */ \
const uint16x8_t sum_LTL = vpaddlq_u8(pLTL); \
const uint32x4_t pa = vpaddlq_u16(sum_LTL); \
const uint32x4_t mask = vcleq_u32(pa, pb); \
const uint8x16_t res = vbslq_u8(vreinterpretq_u8_u32(mask), sumTin, sumLin); \
vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
L = ROTATE32_LEFT(res); \
} while (0)
static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
const uint8x16_t pTTL = vabdq_u8(T, TL); // |T - TL|
const uint16x8_t sum_TTL = vpaddlq_u8(pTTL);
const uint32x4_t pb = vpaddlq_u16(sum_TTL);
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
const uint8x16_t sumTin = vaddq_u8(T, src); // in + T
DO_PRED11(0);
DO_PRED11(1);
DO_PRED11(2);
DO_PRED11(3);
}
VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED11
// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE) do { \
const uint8x8_t pred = \
vqmovun_s16(vaddq_s16(vreinterpretq_s16_u16(L), (DIFF))); \
const uint8x8_t res = \
vadd_u8(pred, (LANE <= 1) ? vget_low_u8(src) : vget_high_u8(src)); \
const uint16x8_t res16 = vmovl_u8(res); \
vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \
/* rotate in the left predictor for next iteration */ \
L = vextq_u16(res16, res16, 4); \
} while (0)
static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
for (i = 0; i + 4 <= num_pixels; i += 4) {
// load four pixels of source
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
// precompute the difference T - TL once for all, stored as s16
const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
const int16x8_t diff_lo =
vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), vget_low_u8(TL)));
const int16x8_t diff_hi =
vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), vget_high_u8(TL)));
// loop over the four reconstructed pixels
DO_PRED12(diff_lo, 0);
DO_PRED12(diff_lo, 1);
DO_PRED12(diff_hi, 2);
DO_PRED12(diff_hi, 3);
}
VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED12
// Predictor13: ClampedAddSubtractHalf
#define DO_PRED13(LANE, LOW_OR_HI) do { \
const uint8x16_t avg = vhaddq_u8(L, T); \
const uint8x16_t cmp = vcgtq_u8(TL, avg); \
const uint8x16_t TL_1 = vaddq_u8(TL, cmp); \
/* Compute half of the difference between avg and TL'. */ \
const int8x8_t diff_avg = \
vreinterpret_s8_u8(LOW_OR_HI(vhsubq_u8(avg, TL_1))); \
/* Compute the sum with avg and saturate. */ \
const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(LOW_OR_HI(avg))); \
const uint8x8_t delta = vqmovun_s16(vaddw_s8(avg_16, diff_avg)); \
const uint8x8_t res = vadd_u8(LOW_OR_HI(src), delta); \
const uint8x16_t res2 = vcombine_u8(res, res); \
vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \
L = ROTATE32_LEFT(res2); \
} while (0)
static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
DO_PRED13(0, vget_low_u8);
DO_PRED13(1, vget_low_u8);
DO_PRED13(2, vget_high_u8);
DO_PRED13(3, vget_high_u8);
}
VP8LPredictorsAdd_C[13](in + i, upper + i, num_pixels - i, out + i);
}
#undef DO_PRED13
#undef LOAD_U32_AS_U8
#undef LOAD_U32P_AS_U8
#undef LOADQ_U32_AS_U8
#undef LOADQ_U32P_AS_U8
#undef GET_U8_AS_U32
#undef GETQ_U8_AS_U32
#undef STOREQ_U8_AS_U32P
#undef ROTATE32_LEFT
//------------------------------------------------------------------------------
// Subtract-Green Transform
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
// non-standard versions there.
#if defined(__APPLE__) && defined(__aarch64__) && \
defined(__apple_build_version__) && (__apple_build_version__< 6020037)
#define USE_VTBLQ
#endif
#ifdef USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[16] = {
1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
};
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x16_t shuffle) {
return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
vtbl1q_u8(argb, vget_high_u8(shuffle)));
}
#else // !USE_VTBLQ
// 255 = byte will be zeroed
static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
static WEBP_INLINE uint8x16_t DoGreenShuffle_NEON(const uint8x16_t argb,
const uint8x8_t shuffle) {
return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
vtbl1_u8(vget_high_u8(argb), shuffle));
}
#endif // USE_VTBLQ
static void AddGreenToBlueAndRed_NEON(const uint32_t* src, int num_pixels,
uint32_t* dst) {
const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
for (; src < end; src += 4, dst += 4) {
const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
const uint8x16_t greens = DoGreenShuffle_NEON(argb, shuffle);
vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse_NEON(const VP8LMultipliers* const m,
const uint32_t* const src,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_),
CST(green_to_blue_), CST(green_to_red_)
};
const int16x8_t mults_rb = vld1q_s16(rb);
const int16_t b2[8] = {
0, CST(red_to_blue_), 0, CST(red_to_blue_),
0, CST(red_to_blue_), 0, CST(red_to_blue_),
};
const int16x8_t mults_b2 = vld1q_s16(b2);
#undef CST
#ifdef USE_VTBLQ
static const uint8_t kg0g0[16] = {
255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
};
const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle_NEON(in, shuffle);
// x dr x db1
const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
// x r' x b'
const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in),
vreinterpretq_s8_s16(A));
// r' 0 b' 0
const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8);
// x db2 0 0
const int16x8_t D = vqdmulhq_s16(C, mults_b2);
// 0 x db2 0
const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8);
// r' x b'' 0
const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E),
vreinterpretq_s8_s16(C));
// 0 r' 0 b''
const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
vst1q_u32(dst + i, out);
}
// Fall-back to C-version for left-overs.
VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
}
#undef USE_VTBLQ
//------------------------------------------------------------------------------
// Entry point
extern void VP8LDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
VP8LPredictors[5] = Predictor5_NEON;
VP8LPredictors[6] = Predictor6_NEON;
VP8LPredictors[7] = Predictor7_NEON;
VP8LPredictors[13] = Predictor13_NEON;
VP8LPredictorsAdd[0] = PredictorAdd0_NEON;
VP8LPredictorsAdd[1] = PredictorAdd1_NEON;
VP8LPredictorsAdd[2] = PredictorAdd2_NEON;
VP8LPredictorsAdd[3] = PredictorAdd3_NEON;
VP8LPredictorsAdd[4] = PredictorAdd4_NEON;
VP8LPredictorsAdd[5] = PredictorAdd5_NEON;
VP8LPredictorsAdd[6] = PredictorAdd6_NEON;
VP8LPredictorsAdd[7] = PredictorAdd7_NEON;
VP8LPredictorsAdd[8] = PredictorAdd8_NEON;
VP8LPredictorsAdd[9] = PredictorAdd9_NEON;
VP8LPredictorsAdd[10] = PredictorAdd10_NEON;
VP8LPredictorsAdd[11] = PredictorAdd11_NEON;
VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_NEON;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_NEON;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_NEON;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_NEON;
VP8LTransformColorInverse = TransformColorInverse_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(VP8LDspInitNEON)
#endif // WEBP_USE_NEON

View File

@ -0,0 +1,708 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 variant of methods for lossless decoder
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE2)
#include "dsp_common_sse2.h"
#include "dsp_lossless.h"
#include "dsp_lossless_common.h"
#include <assert.h>
#include <emmintrin.h>
//------------------------------------------------------------------------------
// Predictor Transform
static WEBP_INLINE uint32_t ClampedAddSubtractFull_SSE2(uint32_t c0,
uint32_t c1,
uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
const __m128i V1 = _mm_add_epi16(C0, C1);
const __m128i V2 = _mm_sub_epi16(V1, C2);
const __m128i b = _mm_packus_epi16(V2, V2);
const uint32_t output = _mm_cvtsi128_si32(b);
return output;
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf_SSE2(uint32_t c0,
uint32_t c1,
uint32_t c2) {
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
const __m128i avg = _mm_add_epi16(C1, C0);
const __m128i A0 = _mm_srli_epi16(avg, 1);
const __m128i A1 = _mm_sub_epi16(A0, B0);
const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
const __m128i A2 = _mm_sub_epi16(A1, BgtA);
const __m128i A3 = _mm_srai_epi16(A2, 1);
const __m128i A4 = _mm_add_epi16(A0, A3);
const __m128i A5 = _mm_packus_epi16(A4, A4);
const uint32_t output = _mm_cvtsi128_si32(A5);
return output;
}
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
int pa_minus_pb;
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_cvtsi32_si128(a);
const __m128i B0 = _mm_cvtsi32_si128(b);
const __m128i C0 = _mm_cvtsi32_si128(c);
const __m128i AC0 = _mm_subs_epu8(A0, C0);
const __m128i CA0 = _mm_subs_epu8(C0, A0);
const __m128i BC0 = _mm_subs_epu8(B0, C0);
const __m128i CB0 = _mm_subs_epu8(C0, B0);
const __m128i AC = _mm_or_si128(AC0, CA0);
const __m128i BC = _mm_or_si128(BC0, CB0);
const __m128i pa = _mm_unpacklo_epi8(AC, zero); // |a - c|
const __m128i pb = _mm_unpacklo_epi8(BC, zero); // |b - c|
const __m128i diff = _mm_sub_epi16(pb, pa);
{
int16_t out[8];
_mm_storeu_si128((__m128i*)out, diff);
pa_minus_pb = out[0] + out[1] + out[2] + out[3];
}
return (pa_minus_pb <= 0) ? a : b;
}
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
const __m128i* const a1,
__m128i* const avg) {
// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
const __m128i ones = _mm_set1_epi8(1);
const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
*avg = _mm_sub_epi8(avg1, one);
}
static WEBP_INLINE void Average2_uint32_SSE2(const uint32_t a0,
const uint32_t a1,
__m128i* const avg) {
// (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
const __m128i ones = _mm_set1_epi8(1);
const __m128i A0 = _mm_cvtsi32_si128(a0);
const __m128i A1 = _mm_cvtsi32_si128(a1);
const __m128i avg1 = _mm_avg_epu8(A0, A1);
const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
*avg = _mm_sub_epi8(avg1, one);
}
static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(A1, A0);
return _mm_srli_epi16(sum, 1);
}
static WEBP_INLINE uint32_t Average2_SSE2(uint32_t a0, uint32_t a1) {
__m128i output;
Average2_uint32_SSE2(a0, a1, &output);
return _mm_cvtsi128_si32(output);
}
static WEBP_INLINE uint32_t Average3_SSE2(uint32_t a0, uint32_t a1,
uint32_t a2) {
const __m128i zero = _mm_setzero_si128();
const __m128i avg1 = Average2_uint32_16_SSE2(a0, a2);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(avg1, A1);
const __m128i avg2 = _mm_srli_epi16(sum, 1);
const __m128i A2 = _mm_packus_epi16(avg2, avg2);
const uint32_t output = _mm_cvtsi128_si32(A2);
return output;
}
static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
const __m128i sum = _mm_add_epi16(avg2, avg1);
const __m128i avg3 = _mm_srli_epi16(sum, 1);
const __m128i A0 = _mm_packus_epi16(avg3, avg3);
const uint32_t output = _mm_cvtsi128_si32(A0);
return output;
}
static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3_SSE2(left, top[0], top[1]);
return pred;
}
static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2_SSE2(left, top[-1]);
return pred;
}
static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2_SSE2(left, top[0]);
return pred;
}
static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2_SSE2(top[-1], top[0]);
(void)left;
return pred;
}
static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2_SSE2(top[0], top[1]);
(void)left;
return pred;
}
static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4_SSE2(left, top[-1], top[0], top[1]);
return pred;
}
static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select_SSE2(top[0], left, top[-1]);
return pred;
}
static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull_SSE2(left, top[0], top[-1]);
return pred;
}
static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf_SSE2(left, top[0], top[-1]);
return pred;
}
// Batch versions of those functions.
// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i black = _mm_set1_epi32(ARGB_BLACK);
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i res = _mm_add_epi8(src, black);
_mm_storeu_si128((__m128i*)&out[i], res);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[0](in + i, NULL, num_pixels - i, out + i);
}
(void)upper;
}
// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
__m128i prev = _mm_set1_epi32(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
// a | b | c | d
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
// 0 | a | b | c
const __m128i shift0 = _mm_slli_si128(src, 4);
// a | a + b | b + c | c + d
const __m128i sum0 = _mm_add_epi8(src, shift0);
// 0 | 0 | a | a + b
const __m128i shift1 = _mm_slli_si128(sum0, 8);
// a | a + b | a + b + c | a + b + c + d
const __m128i sum1 = _mm_add_epi8(sum0, shift1);
const __m128i res = _mm_add_epi8(sum1, prev);
_mm_storeu_si128((__m128i*)&out[i], res);
// replicate prev output on the four lanes
prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
}
}
// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN) \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \
const __m128i res = _mm_add_epi8(src, other); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
if (i != num_pixels) { \
VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
} \
}
// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1
// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 5 to 7.
GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
#define GENERATE_PREDICTOR_2(X, IN) \
static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
int num_pixels, uint32_t* out) { \
int i; \
for (i = 0; i + 4 <= num_pixels; i += 4) { \
const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); \
const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
__m128i avg, res; \
Average2_m128i(&T, &Tother, &avg); \
res = _mm_add_epi8(avg, src); \
_mm_storeu_si128((__m128i*)&out[i], res); \
} \
if (i != num_pixels) { \
VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
} \
}
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2
// Predictor10: average of (average of (L,TL), average of (T, TR)).
#define DO_PRED10(OUT) do { \
__m128i avgLTL, avg; \
Average2_m128i(&L, &TL, &avgLTL); \
Average2_m128i(&avgTTR, &avgLTL, &avg); \
L = _mm_add_epi8(avg, src); \
out[i + (OUT)] = _mm_cvtsi128_si32(L); \
} while (0)
#define DO_PRED10_SHIFT do { \
/* Rotate the pre-computed values for the next iteration.*/ \
avgTTR = _mm_srli_si128(avgTTR, 4); \
TL = _mm_srli_si128(TL, 4); \
src = _mm_srli_si128(src, 4); \
} while (0)
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
__m128i L = _mm_cvtsi32_si128(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
__m128i avgTTR;
Average2_m128i(&T, &TR, &avgTTR);
DO_PRED10(0);
DO_PRED10_SHIFT;
DO_PRED10(1);
DO_PRED10_SHIFT;
DO_PRED10(2);
DO_PRED10_SHIFT;
DO_PRED10(3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
}
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT
// Predictor11: select.
#define DO_PRED11(OUT) do { \
const __m128i L_lo = _mm_unpacklo_epi32(L, T); \
const __m128i TL_lo = _mm_unpacklo_epi32(TL, T); \
const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/ \
const __m128i mask = _mm_cmpgt_epi32(pb, pa); \
const __m128i A = _mm_and_si128(mask, L); \
const __m128i B = _mm_andnot_si128(mask, T); \
const __m128i pred = _mm_or_si128(A, B); /* pred = (pa > b)? L : T*/ \
L = _mm_add_epi8(src, pred); \
out[i + (OUT)] = _mm_cvtsi128_si32(L); \
} while (0)
#define DO_PRED11_SHIFT do { \
/* Shift the pre-computed value for the next iteration.*/ \
T = _mm_srli_si128(T, 4); \
TL = _mm_srli_si128(TL, 4); \
src = _mm_srli_si128(src, 4); \
pa = _mm_srli_si128(pa, 4); \
} while (0)
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
__m128i pa;
__m128i L = _mm_cvtsi32_si128(out[-1]);
for (i = 0; i + 4 <= num_pixels; i += 4) {
__m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
__m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
{
// We can unpack with any value on the upper 32 bits, provided it's the
// same on both operands (so that their sum of abs diff is zero). Here we
// use T.
const __m128i T_lo = _mm_unpacklo_epi32(T, T);
const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
const __m128i T_hi = _mm_unpackhi_epi32(T, T);
const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
pa = _mm_packs_epi32(s_lo, s_hi); // pa = sum |T-TL|
}
DO_PRED11(0);
DO_PRED11_SHIFT;
DO_PRED11(1);
DO_PRED11_SHIFT;
DO_PRED11(2);
DO_PRED11_SHIFT;
DO_PRED11(3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
}
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT
// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, LANE, OUT) do { \
const __m128i all = _mm_add_epi16(L, (DIFF)); \
const __m128i alls = _mm_packus_epi16(all, all); \
const __m128i res = _mm_add_epi8(src, alls); \
out[i + (OUT)] = _mm_cvtsi128_si32(res); \
L = _mm_unpacklo_epi8(res, zero); \
} while (0)
#define DO_PRED12_SHIFT(DIFF, LANE) do { \
/* Shift the pre-computed value for the next iteration.*/ \
if ((LANE) == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
src = _mm_srli_si128(src, 4); \
} while (0)
static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
int num_pixels, uint32_t* out) {
int i;
const __m128i zero = _mm_setzero_si128();
const __m128i L8 = _mm_cvtsi32_si128(out[-1]);
__m128i L = _mm_unpacklo_epi8(L8, zero);
for (i = 0; i + 4 <= num_pixels; i += 4) {
// Load 4 pixels at a time.
__m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
__m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
__m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
DO_PRED12(diff_lo, 0, 0);
DO_PRED12_SHIFT(diff_lo, 0);
DO_PRED12(diff_lo, 1, 1);
DO_PRED12_SHIFT(diff_lo, 1);
DO_PRED12(diff_hi, 0, 2);
DO_PRED12_SHIFT(diff_hi, 0);
DO_PRED12(diff_hi, 1, 3);
}
if (i != num_pixels) {
VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
}
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT
// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 13.
GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
//------------------------------------------------------------------------------
// Subtract-Green Transform
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
uint32_t* dst) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
const __m128i out = _mm_add_epi8(in, C);
_mm_storeu_si128((__m128i*)&dst[i], out);
}
// fallthrough and finish off with plain-C
if (i != num_pixels) {
VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
}
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
const uint32_t* const src,
int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
#define MK_CST_16(HI, LO) \
_mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
const __m128i E = _mm_add_epi8(in, D); // x r' x b'
const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0
const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0
const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0
const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0
const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''
const __m128i out = _mm_or_si128(J, A);
_mm_storeu_si128((__m128i*)&dst[i], out);
}
// Fall-back to C-version for left-overs.
if (i != num_pixels) {
VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
}
}
//------------------------------------------------------------------------------
// Color-space conversion functions
static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 32) {
// Load the BGRA buffers.
__m128i in0 = _mm_loadu_si128(in + 0);
__m128i in1 = _mm_loadu_si128(in + 1);
__m128i in2 = _mm_loadu_si128(in + 2);
__m128i in3 = _mm_loadu_si128(in + 3);
__m128i in4 = _mm_loadu_si128(in + 4);
__m128i in5 = _mm_loadu_si128(in + 5);
__m128i in6 = _mm_loadu_si128(in + 6);
__m128i in7 = _mm_loadu_si128(in + 7);
VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
// At this points, in1/in5 contains red only, in2/in6 green only ...
// Pack the colors in 24b RGB.
VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
_mm_storeu_si128(out + 0, in1);
_mm_storeu_si128(out + 1, in5);
_mm_storeu_si128(out + 2, in2);
_mm_storeu_si128(out + 3, in6);
_mm_storeu_si128(out + 4, in3);
_mm_storeu_si128(out + 5, in7);
in += 8;
out += 6;
num_pixels -= 32;
}
// left-overs
if (num_pixels > 0) {
VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
}
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
const __m128i A1 = _mm_loadu_si128(in++);
const __m128i A2 = _mm_loadu_si128(in++);
const __m128i B1 = _mm_and_si128(A1, red_blue_mask); // R 0 B 0
const __m128i B2 = _mm_and_si128(A2, red_blue_mask); // R 0 B 0
const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1); // 0 G 0 A
const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2); // 0 G 0 A
const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
const __m128i F1 = _mm_or_si128(E1, C1);
const __m128i F2 = _mm_or_si128(E2, C2);
_mm_storeu_si128(out++, F1);
_mm_storeu_si128(out++, F2);
num_pixels -= 8;
}
// left-overs
if (num_pixels > 0) {
VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
}
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
const __m128i ga1 = _mm_srli_epi16(ga0, 4); // g0-|g1-|...|a6-|a7-
const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0); // -r0|-r1|...|-b6|-a7
const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f); // g0-|g1-|...|a6-|a7-
const __m128i rgba0 = _mm_or_si128(ga2, rb1); // rg0..rg7 | ba0..ba7
const __m128i rgba1 = _mm_srli_si128(rgba0, 8); // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
#else
const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
#endif
_mm_storeu_si128(out++, rgba);
num_pixels -= 8;
}
// left-overs
if (num_pixels > 0) {
VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
}
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
const __m128i mask_0x07 = _mm_set1_epi8(0x07);
const __m128i* in = (const __m128i*)src;
__m128i* out = (__m128i*)dst;
while (num_pixels >= 8) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4...
const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6...
const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6...
const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7...
const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7
const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7
const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7
const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7
const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8); // -r0..-r7|-b0..-b7
const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07); // g0-...g7-|xx (3b)
const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0); // -g0...-g7|xx (3b)
const __m128i b0 = _mm_srli_si128(rb1, 8); // -b0...-b7|0
const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx
const __m128i b1 = _mm_srli_epi16(b0, 3);
const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7
#else
const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7
#endif
_mm_storeu_si128(out++, rgba);
num_pixels -= 8;
}
// left-overs
if (num_pixels > 0) {
VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
}
static void ConvertBGRAToBGR_SSE2(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
const __m128i* in = (const __m128i*)src;
const uint8_t* const end = dst + num_pixels * 3;
// the last storel_epi64 below writes 8 bytes starting at offset 18
while (dst + 26 <= end) {
const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3
const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7
const __m128i a0l = _mm_and_si128(bgra0, mask_l); // bgr0|0|bgr0|0
const __m128i a4l = _mm_and_si128(bgra4, mask_l); // bgr0|0|bgr0|0
const __m128i a0h = _mm_and_si128(bgra0, mask_h); // 0|bgr0|0|bgr0
const __m128i a4h = _mm_and_si128(bgra4, mask_h); // 0|bgr0|0|bgr0
const __m128i b0h = _mm_srli_epi64(a0h, 8); // 000b|gr00|000b|gr00
const __m128i b4h = _mm_srli_epi64(a4h, 8); // 000b|gr00|000b|gr00
const __m128i c0 = _mm_or_si128(a0l, b0h); // rgbrgb00|rgbrgb00
const __m128i c4 = _mm_or_si128(a4l, b4h); // rgbrgb00|rgbrgb00
const __m128i c2 = _mm_srli_si128(c0, 8);
const __m128i c6 = _mm_srli_si128(c4, 8);
_mm_storel_epi64((__m128i*)(dst + 0), c0);
_mm_storel_epi64((__m128i*)(dst + 6), c2);
_mm_storel_epi64((__m128i*)(dst + 12), c4);
_mm_storel_epi64((__m128i*)(dst + 18), c6);
dst += 24;
num_pixels -= 8;
}
// left-overs
if (num_pixels > 0) {
VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void VP8LDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
VP8LPredictors[5] = Predictor5_SSE2;
VP8LPredictors[6] = Predictor6_SSE2;
VP8LPredictors[7] = Predictor7_SSE2;
VP8LPredictors[8] = Predictor8_SSE2;
VP8LPredictors[9] = Predictor9_SSE2;
VP8LPredictors[10] = Predictor10_SSE2;
VP8LPredictors[11] = Predictor11_SSE2;
VP8LPredictors[12] = Predictor12_SSE2;
VP8LPredictors[13] = Predictor13_SSE2;
VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_SSE2;
VP8LTransformColorInverse = TransformColorInverse_SSE2;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE2;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_SSE2;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444_SSE2;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565_SSE2;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)
#endif // WEBP_USE_SSE2

View File

@ -0,0 +1,200 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS common macros
#ifndef WEBP_DSP_MIPS_MACRO_H_
#define WEBP_DSP_MIPS_MACRO_H_
#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
#define WORK_AROUND_GCC
#endif
#define STR(s) #s
#define XSTR(s) STR(s)
// O0[31..16 | 15..0] = I0[31..16 | 15..0] + I1[31..16 | 15..0]
// O1[31..16 | 15..0] = I0[31..16 | 15..0] - I1[31..16 | 15..0]
// O - output
// I - input (macro doesn't change it)
#define ADD_SUB_HALVES(O0, O1, \
I0, I1) \
"addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t" \
"subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t"
// O - output
// I - input (macro doesn't change it)
// I[0/1] - offset in bytes
#define LOAD_IN_X2(O0, O1, \
I0, I1) \
"lh %[" #O0 "], " #I0 "(%[in]) \n\t" \
"lh %[" #O1 "], " #I1 "(%[in]) \n\t"
// I0 - location
// I1..I9 - offsets in bytes
#define LOAD_WITH_OFFSET_X4(O0, O1, O2, O3, \
I0, I1, I2, I3, I4, I5, I6, I7, I8, I9) \
"ulw %[" #O0 "], " #I1 "+" XSTR(I9) "*" #I5 "(%[" #I0 "]) \n\t" \
"ulw %[" #O1 "], " #I2 "+" XSTR(I9) "*" #I6 "(%[" #I0 "]) \n\t" \
"ulw %[" #O2 "], " #I3 "+" XSTR(I9) "*" #I7 "(%[" #I0 "]) \n\t" \
"ulw %[" #O3 "], " #I4 "+" XSTR(I9) "*" #I8 "(%[" #I0 "]) \n\t"
// O - output
// IO - input/output
// I - input (macro doesn't change it)
#define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7, \
IO0, IO1, IO2, IO3, \
I0, I1, I2, I3, I4, I5, I6, I7) \
"mul %[" #O0 "], %[" #I0 "], %[kC2] \n\t" \
"mul %[" #O1 "], %[" #I0 "], %[kC1] \n\t" \
"mul %[" #O2 "], %[" #I1 "], %[kC2] \n\t" \
"mul %[" #O3 "], %[" #I1 "], %[kC1] \n\t" \
"mul %[" #O4 "], %[" #I2 "], %[kC2] \n\t" \
"mul %[" #O5 "], %[" #I2 "], %[kC1] \n\t" \
"mul %[" #O6 "], %[" #I3 "], %[kC2] \n\t" \
"mul %[" #O7 "], %[" #I3 "], %[kC1] \n\t" \
"sra %[" #O0 "], %[" #O0 "], 16 \n\t" \
"sra %[" #O1 "], %[" #O1 "], 16 \n\t" \
"sra %[" #O2 "], %[" #O2 "], 16 \n\t" \
"sra %[" #O3 "], %[" #O3 "], 16 \n\t" \
"sra %[" #O4 "], %[" #O4 "], 16 \n\t" \
"sra %[" #O5 "], %[" #O5 "], 16 \n\t" \
"sra %[" #O6 "], %[" #O6 "], 16 \n\t" \
"sra %[" #O7 "], %[" #O7 "], 16 \n\t" \
"addu %[" #IO0 "], %[" #IO0 "], %[" #I4 "] \n\t" \
"addu %[" #IO1 "], %[" #IO1 "], %[" #I5 "] \n\t" \
"subu %[" #IO2 "], %[" #IO2 "], %[" #I6 "] \n\t" \
"subu %[" #IO3 "], %[" #IO3 "], %[" #I7 "] \n\t"
// O - output
// I - input (macro doesn't change it)
#define INSERT_HALF_X2(O0, O1, \
I0, I1) \
"ins %[" #O0 "], %[" #I0 "], 16, 16 \n\t" \
"ins %[" #O1 "], %[" #I1 "], 16, 16 \n\t"
// O - output
// I - input (macro doesn't change it)
#define SRA_16(O0, O1, O2, O3, \
I0, I1, I2, I3) \
"sra %[" #O0 "], %[" #I0 "], 16 \n\t" \
"sra %[" #O1 "], %[" #I1 "], 16 \n\t" \
"sra %[" #O2 "], %[" #I2 "], 16 \n\t" \
"sra %[" #O3 "], %[" #I3 "], 16 \n\t"
// temp0[31..16 | 15..0] = temp8[31..16 | 15..0] + temp12[31..16 | 15..0]
// temp1[31..16 | 15..0] = temp8[31..16 | 15..0] - temp12[31..16 | 15..0]
// temp0[31..16 | 15..0] = temp0[31..16 >> 3 | 15..0 >> 3]
// temp1[31..16 | 15..0] = temp1[31..16 >> 3 | 15..0 >> 3]
// O - output
// I - input (macro doesn't change it)
#define SHIFT_R_SUM_X2(O0, O1, O2, O3, O4, O5, O6, O7, \
I0, I1, I2, I3, I4, I5, I6, I7) \
"addq.ph %[" #O0 "], %[" #I0 "], %[" #I4 "] \n\t" \
"subq.ph %[" #O1 "], %[" #I0 "], %[" #I4 "] \n\t" \
"addq.ph %[" #O2 "], %[" #I1 "], %[" #I5 "] \n\t" \
"subq.ph %[" #O3 "], %[" #I1 "], %[" #I5 "] \n\t" \
"addq.ph %[" #O4 "], %[" #I2 "], %[" #I6 "] \n\t" \
"subq.ph %[" #O5 "], %[" #I2 "], %[" #I6 "] \n\t" \
"addq.ph %[" #O6 "], %[" #I3 "], %[" #I7 "] \n\t" \
"subq.ph %[" #O7 "], %[" #I3 "], %[" #I7 "] \n\t" \
"shra.ph %[" #O0 "], %[" #O0 "], 3 \n\t" \
"shra.ph %[" #O1 "], %[" #O1 "], 3 \n\t" \
"shra.ph %[" #O2 "], %[" #O2 "], 3 \n\t" \
"shra.ph %[" #O3 "], %[" #O3 "], 3 \n\t" \
"shra.ph %[" #O4 "], %[" #O4 "], 3 \n\t" \
"shra.ph %[" #O5 "], %[" #O5 "], 3 \n\t" \
"shra.ph %[" #O6 "], %[" #O6 "], 3 \n\t" \
"shra.ph %[" #O7 "], %[" #O7 "], 3 \n\t"
// precrq.ph.w temp0, temp8, temp2
// temp0 = temp8[31..16] | temp2[31..16]
// ins temp2, temp8, 16, 16
// temp2 = temp8[31..16] | temp2[15..0]
// O - output
// IO - input/output
// I - input (macro doesn't change it)
#define PACK_2_HALVES_TO_WORD(O0, O1, O2, O3, \
IO0, IO1, IO2, IO3, \
I0, I1, I2, I3) \
"precrq.ph.w %[" #O0 "], %[" #I0 "], %[" #IO0 "] \n\t" \
"precrq.ph.w %[" #O1 "], %[" #I1 "], %[" #IO1 "] \n\t" \
"ins %[" #IO0 "], %[" #I0 "], 16, 16 \n\t" \
"ins %[" #IO1 "], %[" #I1 "], 16, 16 \n\t" \
"precrq.ph.w %[" #O2 "], %[" #I2 "], %[" #IO2 "] \n\t" \
"precrq.ph.w %[" #O3 "], %[" #I3 "], %[" #IO3 "] \n\t" \
"ins %[" #IO2 "], %[" #I2 "], 16, 16 \n\t" \
"ins %[" #IO3 "], %[" #I3 "], 16, 16 \n\t"
// preceu.ph.qbr temp0, temp8
// temp0 = 0 | 0 | temp8[23..16] | temp8[7..0]
// preceu.ph.qbl temp1, temp8
// temp1 = temp8[23..16] | temp8[7..0] | 0 | 0
// O - output
// I - input (macro doesn't change it)
#define CONVERT_2_BYTES_TO_HALF(O0, O1, O2, O3, O4, O5, O6, O7, \
I0, I1, I2, I3) \
"preceu.ph.qbr %[" #O0 "], %[" #I0 "] \n\t" \
"preceu.ph.qbl %[" #O1 "], %[" #I0 "] \n\t" \
"preceu.ph.qbr %[" #O2 "], %[" #I1 "] \n\t" \
"preceu.ph.qbl %[" #O3 "], %[" #I1 "] \n\t" \
"preceu.ph.qbr %[" #O4 "], %[" #I2 "] \n\t" \
"preceu.ph.qbl %[" #O5 "], %[" #I2 "] \n\t" \
"preceu.ph.qbr %[" #O6 "], %[" #I3 "] \n\t" \
"preceu.ph.qbl %[" #O7 "], %[" #I3 "] \n\t"
// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
// temp1..temp7 same as temp0
// precrqu_s.qb.ph temp0, temp1, temp0:
// temp0 = temp1[31..24] | temp1[15..8] | temp0[31..24] | temp0[15..8]
// store temp0 to dst
// IO - input/output
// I - input (macro doesn't change it)
#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7, \
I0, I1, I2, I3, I4, I5, I6, I7, \
I8, I9, I10, I11, I12, I13) \
"addq.ph %[" #IO0 "], %[" #IO0 "], %[" #I0 "] \n\t" \
"addq.ph %[" #IO1 "], %[" #IO1 "], %[" #I1 "] \n\t" \
"addq.ph %[" #IO2 "], %[" #IO2 "], %[" #I2 "] \n\t" \
"addq.ph %[" #IO3 "], %[" #IO3 "], %[" #I3 "] \n\t" \
"addq.ph %[" #IO4 "], %[" #IO4 "], %[" #I4 "] \n\t" \
"addq.ph %[" #IO5 "], %[" #IO5 "], %[" #I5 "] \n\t" \
"addq.ph %[" #IO6 "], %[" #IO6 "], %[" #I6 "] \n\t" \
"addq.ph %[" #IO7 "], %[" #IO7 "], %[" #I7 "] \n\t" \
"shll_s.ph %[" #IO0 "], %[" #IO0 "], 7 \n\t" \
"shll_s.ph %[" #IO1 "], %[" #IO1 "], 7 \n\t" \
"shll_s.ph %[" #IO2 "], %[" #IO2 "], 7 \n\t" \
"shll_s.ph %[" #IO3 "], %[" #IO3 "], 7 \n\t" \
"shll_s.ph %[" #IO4 "], %[" #IO4 "], 7 \n\t" \
"shll_s.ph %[" #IO5 "], %[" #IO5 "], 7 \n\t" \
"shll_s.ph %[" #IO6 "], %[" #IO6 "], 7 \n\t" \
"shll_s.ph %[" #IO7 "], %[" #IO7 "], 7 \n\t" \
"precrqu_s.qb.ph %[" #IO0 "], %[" #IO1 "], %[" #IO0 "] \n\t" \
"precrqu_s.qb.ph %[" #IO2 "], %[" #IO3 "], %[" #IO2 "] \n\t" \
"precrqu_s.qb.ph %[" #IO4 "], %[" #IO5 "], %[" #IO4 "] \n\t" \
"precrqu_s.qb.ph %[" #IO6 "], %[" #IO7 "], %[" #IO6 "] \n\t" \
"usw %[" #IO0 "], " XSTR(I13) "*" #I9 "(%[" #I8 "]) \n\t" \
"usw %[" #IO2 "], " XSTR(I13) "*" #I10 "(%[" #I8 "]) \n\t" \
"usw %[" #IO4 "], " XSTR(I13) "*" #I11 "(%[" #I8 "]) \n\t" \
"usw %[" #IO6 "], " XSTR(I13) "*" #I12 "(%[" #I8 "]) \n\t"
#define OUTPUT_EARLY_CLOBBER_REGS_10() \
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), \
[temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), \
[temp10]"=&r"(temp10)
#define OUTPUT_EARLY_CLOBBER_REGS_18() \
OUTPUT_EARLY_CLOBBER_REGS_10(), \
[temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), \
[temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), \
[temp17]"=&r"(temp17), [temp18]"=&r"(temp18)
#endif // WEBP_DSP_MIPS_MACRO_H_

1392
vendor/github.com/sizeofint/webpanimation/dsp_msa_macro.h generated vendored Normal file

File diff suppressed because it is too large Load Diff

101
vendor/github.com/sizeofint/webpanimation/dsp_neon.h generated vendored Normal file
View File

@ -0,0 +1,101 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON common code.
#ifndef WEBP_DSP_NEON_H_
#define WEBP_DSP_NEON_H_
#include <arm_neon.h>
#include "dsp_dsp.h"
// Right now, some intrinsics functions seem slower, so we disable them
// everywhere except newer clang/gcc or aarch64 where the inline assembly is
// incompatible.
#if LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,9) || defined(__aarch64__)
#define WEBP_USE_INTRINSICS // use intrinsics when possible
#endif
#define INIT_VECTOR2(v, a, b) do { \
v.val[0] = a; \
v.val[1] = b; \
} while (0)
#define INIT_VECTOR3(v, a, b, c) do { \
v.val[0] = a; \
v.val[1] = b; \
v.val[2] = c; \
} while (0)
#define INIT_VECTOR4(v, a, b, c, d) do { \
v.val[0] = a; \
v.val[1] = b; \
v.val[2] = c; \
v.val[3] = d; \
} while (0)
// if using intrinsics, this flag avoids some functions that make gcc-4.6.3
// crash ("internal compiler error: in immed_double_const, at emit-rtl.").
// (probably similar to gcc.gnu.org/bugzilla/show_bug.cgi?id=48183)
#if !(LOCAL_CLANG_PREREQ(3,8) || LOCAL_GCC_PREREQ(4,8) || defined(__aarch64__))
#define WORK_AROUND_GCC
#endif
static WEBP_INLINE int32x4x4_t Transpose4x4_NEON(const int32x4x4_t rows) {
uint64x2x2_t row01, row23;
row01.val[0] = vreinterpretq_u64_s32(rows.val[0]);
row01.val[1] = vreinterpretq_u64_s32(rows.val[1]);
row23.val[0] = vreinterpretq_u64_s32(rows.val[2]);
row23.val[1] = vreinterpretq_u64_s32(rows.val[3]);
// Transpose 64-bit values (there's no vswp equivalent)
{
const uint64x1_t row0h = vget_high_u64(row01.val[0]);
const uint64x1_t row2l = vget_low_u64(row23.val[0]);
const uint64x1_t row1h = vget_high_u64(row01.val[1]);
const uint64x1_t row3l = vget_low_u64(row23.val[1]);
row01.val[0] = vcombine_u64(vget_low_u64(row01.val[0]), row2l);
row23.val[0] = vcombine_u64(row0h, vget_high_u64(row23.val[0]));
row01.val[1] = vcombine_u64(vget_low_u64(row01.val[1]), row3l);
row23.val[1] = vcombine_u64(row1h, vget_high_u64(row23.val[1]));
}
{
const int32x4x2_t out01 = vtrnq_s32(vreinterpretq_s32_u64(row01.val[0]),
vreinterpretq_s32_u64(row01.val[1]));
const int32x4x2_t out23 = vtrnq_s32(vreinterpretq_s32_u64(row23.val[0]),
vreinterpretq_s32_u64(row23.val[1]));
int32x4x4_t out;
out.val[0] = out01.val[0];
out.val[1] = out01.val[1];
out.val[2] = out23.val[0];
out.val[3] = out23.val[1];
return out;
}
}
#if 0 // Useful debug macro.
#include <stdio.h>
#define PRINT_REG(REG, SIZE) do { \
int i; \
printf("%s \t[%d]: 0x", #REG, SIZE); \
if (SIZE == 8) { \
uint8_t _tmp[8]; \
vst1_u8(_tmp, (REG)); \
for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]); \
} else if (SIZE == 16) { \
uint16_t _tmp[4]; \
vst1_u16(_tmp, (REG)); \
for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]); \
} \
printf("\n"); \
} while (0)
#endif
#endif // WEBP_DSP_NEON_H_

85
vendor/github.com/sizeofint/webpanimation/dsp_quant.h generated vendored Normal file
View File

@ -0,0 +1,85 @@
// Copyright 2018 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
#ifndef WEBP_DSP_QUANT_H_
#define WEBP_DSP_QUANT_H_
#include <string.h>
#include "dsp_dsp.h"
#include "webp_types.h"
#if defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) && \
!defined(WEBP_HAVE_NEON_RTCD)
#include <arm_neon.h>
#define IsFlat IsFlat_NEON
static uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
const uint64x2_t b = vpaddlq_u32(a);
return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
vreinterpret_u32_u64(vget_high_u64(b)));
}
static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
int thresh) {
const int16x8_t tst_ones = vdupq_n_s16(-1);
uint32x4_t sum = vdupq_n_u32(0);
for (int i = 0; i < num_blocks; ++i) {
// Set DC to zero.
const int16x8_t a_0 = vsetq_lane_s16(0, vld1q_s16(levels), 0);
const int16x8_t a_1 = vld1q_s16(levels + 8);
const uint16x8_t b_0 = vshrq_n_u16(vtstq_s16(a_0, tst_ones), 15);
const uint16x8_t b_1 = vshrq_n_u16(vtstq_s16(a_1, tst_ones), 15);
sum = vpadalq_u16(sum, b_0);
sum = vpadalq_u16(sum, b_1);
levels += 16;
}
return thresh >= (int32_t)vget_lane_u32(horizontal_add_uint32x4(sum), 0);
}
#else
#define IsFlat IsFlat_C
static WEBP_INLINE int IsFlat(const int16_t* levels, int num_blocks,
int thresh) {
int score = 0;
while (num_blocks-- > 0) { // TODO(skal): refine positional scoring?
int i;
for (i = 1; i < 16; ++i) { // omit DC, we're only interested in AC
score += (levels[i] != 0);
if (score > thresh) return 0;
}
levels += 16;
}
return 1;
}
#endif // defined(WEBP_USE_NEON) && !defined(WEBP_ANDROID_NEON) &&
// !defined(WEBP_HAVE_NEON_RTCD)
static WEBP_INLINE int IsFlatSource16(const uint8_t* src) {
const uint32_t v = src[0] * 0x01010101u;
int i;
for (i = 0; i < 16; ++i) {
if (memcmp(src + 0, &v, 4) || memcmp(src + 4, &v, 4) ||
memcmp(src + 8, &v, 4) || memcmp(src + 12, &v, 4)) {
return 0;
}
src += BPS;
}
return 1;
}
#endif // WEBP_DSP_QUANT_H_

View File

@ -0,0 +1,250 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Rescaling functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include "dsp_dsp.h"
#include "utils_rescaler_utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
//------------------------------------------------------------------------------
// Row import
void WebPRescalerImportRowExpand_C(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
assert(!WebPRescalerInputDone(wrk));
assert(wrk->x_expand);
for (channel = 0; channel < x_stride; ++channel) {
int x_in = channel;
int x_out = channel;
// simple bilinear interpolation
int accum = wrk->x_add;
int left = src[x_in];
int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
x_in += x_stride;
while (1) {
wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
x_out += x_stride;
if (x_out >= x_out_max) break;
accum -= wrk->x_sub;
if (accum < 0) {
left = right;
x_in += x_stride;
assert(x_in < wrk->src_width * x_stride);
right = src[x_in];
accum += wrk->x_add;
}
}
assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
}
}
void WebPRescalerImportRowShrink_C(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
int channel;
assert(!WebPRescalerInputDone(wrk));
assert(!wrk->x_expand);
for (channel = 0; channel < x_stride; ++channel) {
int x_in = channel;
int x_out = channel;
uint32_t sum = 0;
int accum = 0;
while (x_out < x_out_max) {
uint32_t base = 0;
accum += wrk->x_add;
while (accum > 0) {
accum -= wrk->x_sub;
assert(x_in < wrk->src_width * x_stride);
base = src[x_in];
sum += base;
x_in += x_stride;
}
{ // Emit next horizontal pixel.
const rescaler_t frac = base * (-accum);
wrk->frow[x_out] = sum * wrk->x_sub - frac;
// fresh fractional start for next pixel
sum = (int)MULT_FIX(frac, wrk->fx_scale);
}
x_out += x_stride;
}
assert(accum == 0);
}
}
//------------------------------------------------------------------------------
// Row export
void WebPRescalerExportRowExpand_C(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* const frow = wrk->frow;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
for (x_out = 0; x_out < x_out_max; ++x_out) {
const uint64_t I = (uint64_t)A * frow[x_out]
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
void WebPRescalerExportRowShrink_C(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* const frow = wrk->frow;
const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
if (yscale) {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
for (x_out = 0; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
}
#undef MULT_FIX_FLOOR
#undef MULT_FIX
#undef ROUNDER
//------------------------------------------------------------------------------
// Main entry calls
void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
assert(!WebPRescalerInputDone(wrk));
if (!wrk->x_expand) {
WebPRescalerImportRowShrink(wrk, src);
} else {
WebPRescalerImportRowExpand(wrk, src);
}
}
void WebPRescalerExportRow(WebPRescaler* const wrk) {
if (wrk->y_accum <= 0) {
assert(!WebPRescalerOutputDone(wrk));
if (wrk->y_expand) {
WebPRescalerExportRowExpand(wrk);
} else if (wrk->fxy_scale) {
WebPRescalerExportRowShrink(wrk);
} else { // special case
int i;
assert(wrk->src_height == wrk->dst_height && wrk->x_add == 1);
assert(wrk->src_width == 1 && wrk->dst_width <= 2);
for (i = 0; i < wrk->num_channels * wrk->dst_width; ++i) {
wrk->dst[i] = wrk->irow[i];
wrk->irow[i] = 0;
}
}
wrk->y_accum += wrk->y_add;
wrk->dst += wrk->dst_stride;
++wrk->dst_y;
}
}
//------------------------------------------------------------------------------
WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
extern void WebPRescalerDspInitSSE2(void);
extern void WebPRescalerDspInitMIPS32(void);
extern void WebPRescalerDspInitMIPSdspR2(void);
extern void WebPRescalerDspInitMSA(void);
extern void WebPRescalerDspInitNEON(void);
WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
#if !defined(WEBP_REDUCE_SIZE)
#if !WEBP_NEON_OMIT_C_CODE
WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
WebPRescalerExportRowShrink = WebPRescalerExportRowShrink_C;
#endif
WebPRescalerImportRowExpand = WebPRescalerImportRowExpand_C;
WebPRescalerImportRowShrink = WebPRescalerImportRowShrink_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPRescalerDspInitSSE2();
}
#endif
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPRescalerDspInitMIPS32();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPRescalerDspInitMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
WebPRescalerDspInitMSA();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPRescalerDspInitNEON();
}
#endif
assert(WebPRescalerExportRowExpand != NULL);
assert(WebPRescalerExportRowShrink != NULL);
assert(WebPRescalerImportRowExpand != NULL);
assert(WebPRescalerImportRowShrink != NULL);
#endif // WEBP_REDUCE_SIZE
}

View File

@ -0,0 +1,295 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of rescaling functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS32) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "utils_rescaler_utils.h"
//------------------------------------------------------------------------------
// Row import
static void ImportRowShrink_MIPS32(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int fx_scale = wrk->fx_scale;
const int x_add = wrk->x_add;
const int x_sub = wrk->x_sub;
const int x_stride1 = x_stride << 2;
int channel;
assert(!wrk->x_expand);
assert(!WebPRescalerInputDone(wrk));
for (channel = 0; channel < x_stride; ++channel) {
const uint8_t* src1 = src + channel;
rescaler_t* frow = wrk->frow + channel;
int temp1, temp2, temp3;
int base, frac, sum;
int accum, accum1;
int loop_c = x_out_max - channel;
__asm__ volatile (
"li %[temp1], 0x8000 \n\t"
"li %[temp2], 0x10000 \n\t"
"li %[sum], 0 \n\t"
"li %[accum], 0 \n\t"
"1: \n\t"
"addu %[accum], %[accum], %[x_add] \n\t"
"li %[base], 0 \n\t"
"blez %[accum], 3f \n\t"
"2: \n\t"
"lbu %[base], 0(%[src1]) \n\t"
"subu %[accum], %[accum], %[x_sub] \n\t"
"addu %[src1], %[src1], %[x_stride] \n\t"
"addu %[sum], %[sum], %[base] \n\t"
"bgtz %[accum], 2b \n\t"
"3: \n\t"
"negu %[accum1], %[accum] \n\t"
"mul %[frac], %[base], %[accum1] \n\t"
"mul %[temp3], %[sum], %[x_sub] \n\t"
"subu %[loop_c], %[loop_c], %[x_stride] \n\t"
"mult %[temp1], %[temp2] \n\t"
"maddu %[frac], %[fx_scale] \n\t"
"mfhi %[sum] \n\t"
"subu %[temp3], %[temp3], %[frac] \n\t"
"sw %[temp3], 0(%[frow]) \n\t"
"addu %[frow], %[frow], %[x_stride1] \n\t"
"bgtz %[loop_c], 1b \n\t"
: [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
[sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
[frow]"+r"(frow), [accum1]"=&r"(accum1),
[temp2]"=&r"(temp2), [temp1]"=&r"(temp1)
: [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
[x_sub]"r"(x_sub), [x_add]"r"(x_add),
[loop_c]"r"(loop_c), [x_stride1]"r"(x_stride1)
: "memory", "hi", "lo"
);
assert(accum == 0);
}
}
static void ImportRowExpand_MIPS32(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_stride = wrk->num_channels;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
const int x_sub = wrk->x_sub;
const int src_width = wrk->src_width;
const int x_stride1 = x_stride << 2;
int channel;
assert(wrk->x_expand);
assert(!WebPRescalerInputDone(wrk));
for (channel = 0; channel < x_stride; ++channel) {
const uint8_t* src1 = src + channel;
rescaler_t* frow = wrk->frow + channel;
int temp1, temp2, temp3, temp4;
int frac;
int accum;
int x_out = channel;
__asm__ volatile (
"addiu %[temp3], %[src_width], -1 \n\t"
"lbu %[temp2], 0(%[src1]) \n\t"
"addu %[src1], %[src1], %[x_stride] \n\t"
"bgtz %[temp3], 0f \n\t"
"addiu %[temp1], %[temp2], 0 \n\t"
"b 3f \n\t"
"0: \n\t"
"lbu %[temp1], 0(%[src1]) \n\t"
"3: \n\t"
"addiu %[accum], %[x_add], 0 \n\t"
"1: \n\t"
"subu %[temp3], %[temp2], %[temp1] \n\t"
"mul %[temp3], %[temp3], %[accum] \n\t"
"mul %[temp4], %[temp1], %[x_add] \n\t"
"addu %[temp3], %[temp4], %[temp3] \n\t"
"sw %[temp3], 0(%[frow]) \n\t"
"addu %[frow], %[frow], %[x_stride1] \n\t"
"addu %[x_out], %[x_out], %[x_stride] \n\t"
"subu %[temp3], %[x_out], %[x_out_max] \n\t"
"bgez %[temp3], 2f \n\t"
"subu %[accum], %[accum], %[x_sub] \n\t"
"bgez %[accum], 4f \n\t"
"addiu %[temp2], %[temp1], 0 \n\t"
"addu %[src1], %[src1], %[x_stride] \n\t"
"lbu %[temp1], 0(%[src1]) \n\t"
"addu %[accum], %[accum], %[x_add] \n\t"
"4: \n\t"
"b 1b \n\t"
"2: \n\t"
: [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
[x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
: [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
[x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
[x_out_max]"r"(x_out_max)
: "memory", "hi", "lo"
);
assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
}
}
//------------------------------------------------------------------------------
// Row export
static void ExportRowExpand_MIPS32(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* frow = wrk->frow;
int temp0, temp1, temp3, temp4, temp5, loop_end;
const int temp2 = (int)wrk->fy_scale;
const int temp6 = x_out_max << 2;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"addiu %[dst], %[dst], 1 \n\t"
"addiu %[frow], %[frow], 4 \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[temp0], %[temp2] \n\t"
"mfhi %[temp5] \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[dst]"+r"(dst), [loop_end]"=&r"(loop_end)
: [temp2]"r"(temp2), [temp6]"r"(temp6)
: "memory", "hi", "lo"
);
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"lw %[temp1], 0(%[irow]) \n\t"
"addiu %[dst], %[dst], 1 \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[A], %[temp0] \n\t"
"maddu %[B], %[temp1] \n\t"
"addiu %[frow], %[frow], 4 \n\t"
"addiu %[irow], %[irow], 4 \n\t"
"mfhi %[temp5] \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[temp5], %[temp2] \n\t"
"mfhi %[temp5] \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
: [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
: "memory", "hi", "lo"
);
}
}
#if 0 // disabled for now. TODO(skal): make match the C-code
static void ExportRowShrink_MIPS32(WebPRescaler* const wrk) {
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const rescaler_t* frow = wrk->frow;
const int yscale = wrk->fy_scale * (-wrk->y_accum);
int temp0, temp1, temp3, temp4, temp5, loop_end;
const int temp2 = (int)wrk->fxy_scale;
const int temp6 = x_out_max << 2;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
assert(wrk->fxy_scale != 0);
if (yscale) {
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"mult %[temp3], %[temp4] \n\t"
"addiu %[frow], %[frow], 4 \n\t"
"maddu %[temp0], %[yscale] \n\t"
"mfhi %[temp1] \n\t"
"lw %[temp0], 0(%[irow]) \n\t"
"addiu %[dst], %[dst], 1 \n\t"
"addiu %[irow], %[irow], 4 \n\t"
"subu %[temp0], %[temp0], %[temp1] \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[temp0], %[temp2] \n\t"
"mfhi %[temp5] \n\t"
"sw %[temp1], -4(%[irow]) \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
: [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
: "memory", "hi", "lo"
);
} else {
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[irow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[irow]) \n\t"
"addiu %[dst], %[dst], 1 \n\t"
"addiu %[irow], %[irow], 4 \n\t"
"mult %[temp3], %[temp4] \n\t"
"maddu %[temp0], %[temp2] \n\t"
"mfhi %[temp5] \n\t"
"sw $zero, -4(%[irow]) \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[irow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
[dst]"+r"(dst), [loop_end]"=&r"(loop_end)
: [temp2]"r"(temp2), [temp6]"r"(temp6)
: "memory", "hi", "lo"
);
}
}
#endif // 0
//------------------------------------------------------------------------------
// Entry point
extern void WebPRescalerDspInitMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
WebPRescalerImportRowExpand = ImportRowExpand_MIPS32;
WebPRescalerImportRowShrink = ImportRowShrink_MIPS32;
WebPRescalerExportRowExpand = ExportRowExpand_MIPS32;
// WebPRescalerExportRowShrink = ExportRowShrink_MIPS32;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPS32)
#endif // WEBP_USE_MIPS32

View File

@ -0,0 +1,314 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of rescaling functions
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "utils_rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
//------------------------------------------------------------------------------
// Row export
#if 0 // disabled for now. TODO(skal): make match the C-code
static void ExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
int i;
const int x_out_max = wrk->dst_width * wrk->num_channels;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const rescaler_t* frow = wrk->frow;
const int yscale = wrk->fy_scale * (-wrk->y_accum);
int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
const int temp7 = (int)wrk->fxy_scale;
const int temp6 = (x_out_max & ~0x3) << 2;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
assert(wrk->fxy_scale != 0);
if (yscale) {
if (x_out_max >= 4) {
int temp8, temp9, temp10, temp11;
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"lw %[temp1], 4(%[frow]) \n\t"
"lw %[temp2], 8(%[frow]) \n\t"
"lw %[temp5], 12(%[frow]) \n\t"
"mult $ac0, %[temp3], %[temp4] \n\t"
"maddu $ac0, %[temp0], %[yscale] \n\t"
"mult $ac1, %[temp3], %[temp4] \n\t"
"maddu $ac1, %[temp1], %[yscale] \n\t"
"mult $ac2, %[temp3], %[temp4] \n\t"
"maddu $ac2, %[temp2], %[yscale] \n\t"
"mult $ac3, %[temp3], %[temp4] \n\t"
"maddu $ac3, %[temp5], %[yscale] \n\t"
"addiu %[frow], %[frow], 16 \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp5], $ac3 \n\t"
"lw %[temp8], 0(%[irow]) \n\t"
"lw %[temp9], 4(%[irow]) \n\t"
"lw %[temp10], 8(%[irow]) \n\t"
"lw %[temp11], 12(%[irow]) \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"addiu %[irow], %[irow], 16 \n\t"
"subu %[temp8], %[temp8], %[temp0] \n\t"
"subu %[temp9], %[temp9], %[temp1] \n\t"
"subu %[temp10], %[temp10], %[temp2] \n\t"
"subu %[temp11], %[temp11], %[temp5] \n\t"
"mult $ac0, %[temp3], %[temp4] \n\t"
"maddu $ac0, %[temp8], %[temp7] \n\t"
"mult $ac1, %[temp3], %[temp4] \n\t"
"maddu $ac1, %[temp9], %[temp7] \n\t"
"mult $ac2, %[temp3], %[temp4] \n\t"
"maddu $ac2, %[temp10], %[temp7] \n\t"
"mult $ac3, %[temp3], %[temp4] \n\t"
"maddu $ac3, %[temp11], %[temp7] \n\t"
"mfhi %[temp8], $ac0 \n\t"
"mfhi %[temp9], $ac1 \n\t"
"mfhi %[temp10], $ac2 \n\t"
"mfhi %[temp11], $ac3 \n\t"
"sw %[temp0], -16(%[irow]) \n\t"
"sw %[temp1], -12(%[irow]) \n\t"
"sw %[temp2], -8(%[irow]) \n\t"
"sw %[temp5], -4(%[irow]) \n\t"
"sb %[temp8], -4(%[dst]) \n\t"
"sb %[temp9], -3(%[dst]) \n\t"
"sb %[temp10], -2(%[dst]) \n\t"
"sb %[temp11], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
[temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
[temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
: [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
: "memory", "hi", "lo", "$ac1hi", "$ac1lo",
"$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
);
}
for (i = 0; i < (x_out_max & 0x3); ++i) {
const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(*frow++, yscale);
const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
*dst++ = (v > 255) ? 255u : (uint8_t)v;
*irow++ = frac; // new fractional start
}
} else {
if (x_out_max >= 4) {
__asm__ volatile (
"li %[temp3], 0x10000 \n\t"
"li %[temp4], 0x8000 \n\t"
"addu %[loop_end], %[irow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[irow]) \n\t"
"lw %[temp1], 4(%[irow]) \n\t"
"lw %[temp2], 8(%[irow]) \n\t"
"lw %[temp5], 12(%[irow]) \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"addiu %[irow], %[irow], 16 \n\t"
"mult $ac0, %[temp3], %[temp4] \n\t"
"maddu $ac0, %[temp0], %[temp7] \n\t"
"mult $ac1, %[temp3], %[temp4] \n\t"
"maddu $ac1, %[temp1], %[temp7] \n\t"
"mult $ac2, %[temp3], %[temp4] \n\t"
"maddu $ac2, %[temp2], %[temp7] \n\t"
"mult $ac3, %[temp3], %[temp4] \n\t"
"maddu $ac3, %[temp5], %[temp7] \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp5], $ac3 \n\t"
"sw $zero, -16(%[irow]) \n\t"
"sw $zero, -12(%[irow]) \n\t"
"sw $zero, -8(%[irow]) \n\t"
"sw $zero, -4(%[irow]) \n\t"
"sb %[temp0], -4(%[dst]) \n\t"
"sb %[temp1], -3(%[dst]) \n\t"
"sb %[temp2], -2(%[dst]) \n\t"
"sb %[temp5], -1(%[dst]) \n\t"
"bne %[irow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
[dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
: [temp7]"r"(temp7), [temp6]"r"(temp6)
: "memory", "hi", "lo", "$ac1hi", "$ac1lo",
"$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
);
}
for (i = 0; i < (x_out_max & 0x3); ++i) {
const int v = (int)MULT_FIX_FLOOR(*irow, wrk->fxy_scale);
*dst++ = (v > 255) ? 255u : (uint8_t)v;
*irow++ = 0;
}
}
}
#endif // 0
static void ExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
int i;
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* frow = wrk->frow;
int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
const int temp6 = (x_out_max & ~0x3) << 2;
const int temp7 = (int)wrk->fy_scale;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
if (x_out_max >= 4) {
__asm__ volatile (
"li %[temp4], 0x10000 \n\t"
"li %[temp5], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"lw %[temp1], 4(%[frow]) \n\t"
"lw %[temp2], 8(%[frow]) \n\t"
"lw %[temp3], 12(%[frow]) \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"addiu %[frow], %[frow], 16 \n\t"
"mult $ac0, %[temp4], %[temp5] \n\t"
"maddu $ac0, %[temp0], %[temp7] \n\t"
"mult $ac1, %[temp4], %[temp5] \n\t"
"maddu $ac1, %[temp1], %[temp7] \n\t"
"mult $ac2, %[temp4], %[temp5] \n\t"
"maddu $ac2, %[temp2], %[temp7] \n\t"
"mult $ac3, %[temp4], %[temp5] \n\t"
"maddu $ac3, %[temp3], %[temp7] \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp3], $ac3 \n\t"
"sb %[temp0], -4(%[dst]) \n\t"
"sb %[temp1], -3(%[dst]) \n\t"
"sb %[temp2], -2(%[dst]) \n\t"
"sb %[temp3], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
: [temp7]"r"(temp7), [temp6]"r"(temp6)
: "memory", "hi", "lo", "$ac1hi", "$ac1lo",
"$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
);
}
for (i = 0; i < (x_out_max & 0x3); ++i) {
const uint32_t J = *frow++;
const int v = (int)MULT_FIX(J, wrk->fy_scale);
*dst++ = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
if (x_out_max >= 4) {
int temp8, temp9, temp10, temp11;
__asm__ volatile (
"li %[temp8], 0x10000 \n\t"
"li %[temp9], 0x8000 \n\t"
"addu %[loop_end], %[frow], %[temp6] \n\t"
"1: \n\t"
"lw %[temp0], 0(%[frow]) \n\t"
"lw %[temp1], 4(%[frow]) \n\t"
"lw %[temp2], 8(%[frow]) \n\t"
"lw %[temp3], 12(%[frow]) \n\t"
"lw %[temp4], 0(%[irow]) \n\t"
"lw %[temp5], 4(%[irow]) \n\t"
"lw %[temp10], 8(%[irow]) \n\t"
"lw %[temp11], 12(%[irow]) \n\t"
"addiu %[dst], %[dst], 4 \n\t"
"mult $ac0, %[temp8], %[temp9] \n\t"
"maddu $ac0, %[A], %[temp0] \n\t"
"maddu $ac0, %[B], %[temp4] \n\t"
"mult $ac1, %[temp8], %[temp9] \n\t"
"maddu $ac1, %[A], %[temp1] \n\t"
"maddu $ac1, %[B], %[temp5] \n\t"
"mult $ac2, %[temp8], %[temp9] \n\t"
"maddu $ac2, %[A], %[temp2] \n\t"
"maddu $ac2, %[B], %[temp10] \n\t"
"mult $ac3, %[temp8], %[temp9] \n\t"
"maddu $ac3, %[A], %[temp3] \n\t"
"maddu $ac3, %[B], %[temp11] \n\t"
"addiu %[frow], %[frow], 16 \n\t"
"addiu %[irow], %[irow], 16 \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp3], $ac3 \n\t"
"mult $ac0, %[temp8], %[temp9] \n\t"
"maddu $ac0, %[temp0], %[temp7] \n\t"
"mult $ac1, %[temp8], %[temp9] \n\t"
"maddu $ac1, %[temp1], %[temp7] \n\t"
"mult $ac2, %[temp8], %[temp9] \n\t"
"maddu $ac2, %[temp2], %[temp7] \n\t"
"mult $ac3, %[temp8], %[temp9] \n\t"
"maddu $ac3, %[temp3], %[temp7] \n\t"
"mfhi %[temp0], $ac0 \n\t"
"mfhi %[temp1], $ac1 \n\t"
"mfhi %[temp2], $ac2 \n\t"
"mfhi %[temp3], $ac3 \n\t"
"sb %[temp0], -4(%[dst]) \n\t"
"sb %[temp1], -3(%[dst]) \n\t"
"sb %[temp2], -2(%[dst]) \n\t"
"sb %[temp3], -1(%[dst]) \n\t"
"bne %[frow], %[loop_end], 1b \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
[irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
[temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
[temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
: [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
: "memory", "hi", "lo", "$ac1hi", "$ac1lo",
"$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
);
}
for (i = 0; i < (x_out_max & 0x3); ++i) {
const uint64_t I = (uint64_t)A * *frow++
+ (uint64_t)B * *irow++;
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
*dst++ = (v > 255) ? 255u : (uint8_t)v;
}
}
}
#undef MULT_FIX_FLOOR
#undef MULT_FIX
#undef ROUNDER
//------------------------------------------------------------------------------
// Entry point
extern void WebPRescalerDspInitMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
WebPRescalerExportRowExpand = ExportRowExpand_MIPSdspR2;
// WebPRescalerExportRowShrink = ExportRowShrink_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View File

@ -0,0 +1,443 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of rescaling functions
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MSA) && !defined(WEBP_REDUCE_SIZE)
#include <assert.h>
#include "utils_rescaler_utils.h"
#include "dsp_msa_macro.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \
v4u32 tmp0, tmp1, tmp2, tmp3; \
v16u8 t0, t1, t2, t3, t4, t5; \
v2u64 out0, out1, out2, out3; \
ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
SRAR_D4_UD(out0, out1, out2, out3, shift); \
PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \
ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
SRAR_D4_UD(out0, out1, out2, out3, shift); \
PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \
PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \
dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \
} while (0)
#define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \
v4u32 tmp0, tmp1; \
v16i8 t0, t1; \
v2u64 out0, out1; \
ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
SRAR_D2_UD(out0, out1, shift); \
t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
t1 = __msa_pckev_b(t0, t0); \
t0 = __msa_pckev_b(t1, t1); \
dst = __msa_copy_s_w((v4i32)t0, 0); \
} while (0)
#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \
dst0, dst1, dst2, dst3) do { \
v4u32 tmp0, tmp1, tmp2, tmp3; \
v2u64 out0, out1, out2, out3; \
ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
SRAR_D4_UD(out0, out1, out2, out3, shift); \
PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \
ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
SRAR_D4_UD(out0, out1, out2, out3, shift); \
PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \
} while (0)
#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \
v4u32 tmp0, tmp1; \
v2u64 out0, out1; \
ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
SRAR_D2_UD(out0, out1, shift); \
dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \
} while (0)
#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \
dst0, dst1) do { \
v4u32 tmp0, tmp1, tmp2, tmp3; \
v2u64 out0, out1, out2, out3; \
ILVRL_W2_UW(in0, in2, tmp0, tmp1); \
ILVRL_W2_UW(in1, in3, tmp2, tmp3); \
DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \
SRAR_D4_UD(out0, out1, out2, out3, shift); \
DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \
SRAR_D4_UD(out0, out1, out2, out3, shift); \
PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \
} while (0)
#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \
v4u32 tmp0, tmp1; \
v2u64 out0, out1; \
v16i8 t0, t1; \
ILVRL_W2_UW(in0, in1, tmp0, tmp1); \
DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
SRAR_D2_UD(out0, out1, shift); \
DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
SRAR_D2_UD(out0, out1, shift); \
t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
t1 = __msa_pckev_b(t0, t0); \
t0 = __msa_pckev_b(t1, t1); \
dst = __msa_copy_s_w((v4i32)t0, 0); \
} while (0)
static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
int length,
WebPRescaler* const wrk) {
const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
const v4i32 zero = { 0 };
while (length >= 16) {
v4u32 src0, src1, src2, src3;
v16u8 out;
LD_UW4(frow, 4, src0, src1, src2, src3);
CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
ST_UB(out, dst);
length -= 16;
frow += 16;
dst += 16;
}
if (length > 0) {
int x_out;
if (length >= 12) {
uint32_t val0_m, val1_m, val2_m;
v4u32 src0, src1, src2;
LD_UW3(frow, 4, src0, src1, src2);
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
CALC_MULT_FIX_4(src1, scale, shift, val1_m);
CALC_MULT_FIX_4(src2, scale, shift, val2_m);
SW3(val0_m, val1_m, val2_m, dst, 4);
length -= 12;
frow += 12;
dst += 12;
} else if (length >= 8) {
uint32_t val0_m, val1_m;
v4u32 src0, src1;
LD_UW2(frow, 4, src0, src1);
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
CALC_MULT_FIX_4(src1, scale, shift, val1_m);
SW2(val0_m, val1_m, dst, 4);
length -= 8;
frow += 8;
dst += 8;
} else if (length >= 4) {
uint32_t val0_m;
const v4u32 src0 = LD_UW(frow);
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
SW(val0_m, dst);
length -= 4;
frow += 4;
dst += 4;
}
for (x_out = 0; x_out < length; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
uint8_t* dst, int length,
WebPRescaler* const wrk) {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
const v4i32 B1 = __msa_fill_w(B);
const v4i32 A1 = __msa_fill_w(A);
const v4i32 AB = __msa_ilvr_w(A1, B1);
const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
while (length >= 16) {
v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
v16u8 t0, t1, t2, t3, t4, t5;
LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
ST_UB(t0, dst);
frow += 16;
irow += 16;
dst += 16;
length -= 16;
}
if (length > 0) {
int x_out;
if (length >= 12) {
uint32_t val0_m, val1_m, val2_m;
v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
LD_UW3(frow, 4, frow0, frow1, frow2);
LD_UW3(irow, 4, irow0, irow1, irow2);
CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
SW3(val0_m, val1_m, val2_m, dst, 4);
frow += 12;
irow += 12;
dst += 12;
length -= 12;
} else if (length >= 8) {
uint32_t val0_m, val1_m;
v4u32 frow0, frow1, irow0, irow1;
LD_UW2(frow, 4, frow0, frow1);
LD_UW2(irow, 4, irow0, irow1);
CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
SW2(val0_m, val1_m, dst, 4);
frow += 4;
irow += 4;
dst += 4;
length -= 4;
} else if (length >= 4) {
uint32_t val0_m;
const v4u32 frow0 = LD_UW(frow + 0);
const v4u32 irow0 = LD_UW(irow + 0);
CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
SW(val0_m, dst);
frow += 4;
irow += 4;
dst += 4;
length -= 4;
}
for (x_out = 0; x_out < length; ++x_out) {
const uint64_t I = (uint64_t)A * frow[x_out]
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
static void RescalerExportRowExpand_MIPSdspR2(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* frow = wrk->frow;
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
ExportRowExpand_0(frow, dst, x_out_max, wrk);
} else {
ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
}
}
#if 0 // disabled for now. TODO(skal): make match the C-code
static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
uint8_t* dst, int length,
const uint32_t yscale,
WebPRescaler* const wrk) {
const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
const v4i32 zero = { 0 };
while (length >= 16) {
v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
v16u8 out;
LD_UW4(frow, 4, src0, src1, src2, src3);
CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
frac0, frac1, frac2, frac3);
LD_UW4(irow, 4, src0, src1, src2, src3);
SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
src0, src1, src2, src3);
CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
ST_UB(out, dst);
ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
frow += 16;
irow += 16;
dst += 16;
length -= 16;
}
if (length > 0) {
int x_out;
if (length >= 12) {
uint32_t val0_m, val1_m, val2_m;
v4u32 src0, src1, src2, frac0, frac1, frac2;
LD_UW3(frow, 4, src0, src1, src2);
CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
LD_UW3(irow, 4, src0, src1, src2);
SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
SW3(val0_m, val1_m, val2_m, dst, 4);
ST_UW3(frac0, frac1, frac2, irow, 4);
frow += 12;
irow += 12;
dst += 12;
length -= 12;
} else if (length >= 8) {
uint32_t val0_m, val1_m;
v4u32 src0, src1, frac0, frac1;
LD_UW2(frow, 4, src0, src1);
CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
LD_UW2(irow, 4, src0, src1);
SUB2(src0, frac0, src1, frac1, src0, src1);
CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
SW2(val0_m, val1_m, dst, 4);
ST_UW2(frac0, frac1, irow, 4);
frow += 8;
irow += 8;
dst += 8;
length -= 8;
} else if (length >= 4) {
uint32_t val0_m;
v4u32 frac0;
v4u32 src0 = LD_UW(frow);
CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
src0 = LD_UW(irow);
src0 = src0 - frac0;
CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
SW(val0_m, dst);
ST_UW(frac0, irow);
frow += 4;
irow += 4;
dst += 4;
length -= 4;
}
for (x_out = 0; x_out < length; ++x_out) {
const uint32_t frac = (uint32_t)MULT_FIX_FLOOR(frow[x_out], yscale);
const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac;
}
}
}
static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
int length,
WebPRescaler* const wrk) {
const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
const v4i32 zero = { 0 };
while (length >= 16) {
v4u32 src0, src1, src2, src3;
v16u8 dst0;
LD_UW4(irow, 4, src0, src1, src2, src3);
CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
ST_UB(dst0, dst);
ST_SW4(zero, zero, zero, zero, irow, 4);
length -= 16;
irow += 16;
dst += 16;
}
if (length > 0) {
int x_out;
if (length >= 12) {
uint32_t val0_m, val1_m, val2_m;
v4u32 src0, src1, src2;
LD_UW3(irow, 4, src0, src1, src2);
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
CALC_MULT_FIX_4(src1, scale, shift, val1_m);
CALC_MULT_FIX_4(src2, scale, shift, val2_m);
SW3(val0_m, val1_m, val2_m, dst, 4);
ST_SW3(zero, zero, zero, irow, 4);
length -= 12;
irow += 12;
dst += 12;
} else if (length >= 8) {
uint32_t val0_m, val1_m;
v4u32 src0, src1;
LD_UW2(irow, 4, src0, src1);
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
CALC_MULT_FIX_4(src1, scale, shift, val1_m);
SW2(val0_m, val1_m, dst, 4);
ST_SW2(zero, zero, irow, 4);
length -= 8;
irow += 8;
dst += 8;
} else if (length >= 4) {
uint32_t val0_m;
const v4u32 src0 = LD_UW(irow + 0);
CALC_MULT_FIX_4(src0, scale, shift, val0_m);
SW(val0_m, dst);
ST_SW(zero, irow);
length -= 4;
irow += 4;
dst += 4;
}
for (x_out = 0; x_out < length; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
}
static void RescalerExportRowShrink_MIPSdspR2(WebPRescaler* const wrk) {
uint8_t* dst = wrk->dst;
rescaler_t* irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* frow = wrk->frow;
const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
if (yscale) {
ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
} else {
ExportRowShrink_1(irow, dst, x_out_max, wrk);
}
}
#endif // 0
//------------------------------------------------------------------------------
// Entry point
extern void WebPRescalerDspInitMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
WebPRescalerExportRowExpand = RescalerExportRowExpand_MIPSdspR2;
// WebPRescalerExportRowShrink = RescalerExportRowShrink_MIPSdspR2;
}
#else // !WEBP_USE_MSA
WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
#endif // WEBP_USE_MSA

View File

@ -0,0 +1,192 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON version of rescaling functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_NEON) && !defined(WEBP_REDUCE_SIZE)
#include <arm_neon.h>
#include <assert.h>
#include "dsp_neon.h"
#include "utils_rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
#define MULT_FIX_FLOOR_C(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
#define LOAD_32x4(SRC, DST) const uint32x4_t DST = vld1q_u32((SRC))
#define LOAD_32x8(SRC, DST0, DST1) \
LOAD_32x4(SRC + 0, DST0); \
LOAD_32x4(SRC + 4, DST1)
#define STORE_32x8(SRC0, SRC1, DST) do { \
vst1q_u32((DST) + 0, SRC0); \
vst1q_u32((DST) + 4, SRC1); \
} while (0);
#if (WEBP_RESCALER_RFIX == 32)
#define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1))
// note: B is actualy scale>>1. See MAKE_HALF_CST
#define MULT_FIX(A, B) \
vreinterpretq_u32_s32(vqrdmulhq_s32(vreinterpretq_s32_u32((A)), (B)))
#define MULT_FIX_FLOOR(A, B) \
vreinterpretq_u32_s32(vqdmulhq_s32(vreinterpretq_s32_u32((A)), (B)))
#else
#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
#endif
static uint32x4_t Interpolate_NEON(const rescaler_t* const frow,
const rescaler_t* const irow,
uint32_t A, uint32_t B) {
LOAD_32x4(frow, A0);
LOAD_32x4(irow, B0);
const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
const uint64x2_t C1 = vmull_n_u32(vget_high_u32(A0), A);
const uint64x2_t D0 = vmlal_n_u32(C0, vget_low_u32(B0), B);
const uint64x2_t D1 = vmlal_n_u32(C1, vget_high_u32(B0), B);
const uint32x4_t E = vcombine_u32(
vrshrn_n_u64(D0, WEBP_RESCALER_RFIX),
vrshrn_n_u64(D1, WEBP_RESCALER_RFIX));
return E;
}
static void RescalerExportRowExpand_NEON(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int max_span = x_out_max & ~7;
const rescaler_t* const frow = wrk->frow;
const uint32_t fy_scale = wrk->fy_scale;
const int32x4_t fy_scale_half = MAKE_HALF_CST(fy_scale);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(wrk->y_expand);
assert(wrk->y_sub != 0);
if (wrk->y_accum == 0) {
for (x_out = 0; x_out < max_span; x_out += 8) {
LOAD_32x4(frow + x_out + 0, A0);
LOAD_32x4(frow + x_out + 4, A1);
const uint32x4_t B0 = MULT_FIX(A0, fy_scale_half);
const uint32x4_t B1 = MULT_FIX(A1, fy_scale_half);
const uint16x4_t C0 = vmovn_u32(B0);
const uint16x4_t C1 = vmovn_u32(B1);
const uint8x8_t D = vqmovn_u16(vcombine_u16(C0, C1));
vst1_u8(dst + x_out, D);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX_C(J, fy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
for (x_out = 0; x_out < max_span; x_out += 8) {
const uint32x4_t C0 =
Interpolate_NEON(frow + x_out + 0, irow + x_out + 0, A, B);
const uint32x4_t C1 =
Interpolate_NEON(frow + x_out + 4, irow + x_out + 4, A, B);
const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
const uint16x4_t E0 = vmovn_u32(D0);
const uint16x4_t E1 = vmovn_u32(D1);
const uint8x8_t F = vqmovn_u16(vcombine_u16(E0, E1));
vst1_u8(dst + x_out, F);
}
for (; x_out < x_out_max; ++x_out) {
const uint64_t I = (uint64_t)A * frow[x_out]
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX_C(J, fy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
static void RescalerExportRowShrink_NEON(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const int max_span = x_out_max & ~7;
const rescaler_t* const frow = wrk->frow;
const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
const uint32_t fxy_scale = wrk->fxy_scale;
const uint32x4_t zero = vdupq_n_u32(0);
const int32x4_t yscale_half = MAKE_HALF_CST(yscale);
const int32x4_t fxy_scale_half = MAKE_HALF_CST(fxy_scale);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
if (yscale) {
for (x_out = 0; x_out < max_span; x_out += 8) {
LOAD_32x8(frow + x_out, in0, in1);
LOAD_32x8(irow + x_out, in2, in3);
const uint32x4_t A0 = MULT_FIX_FLOOR(in0, yscale_half);
const uint32x4_t A1 = MULT_FIX_FLOOR(in1, yscale_half);
const uint32x4_t B0 = vqsubq_u32(in2, A0);
const uint32x4_t B1 = vqsubq_u32(in3, A1);
const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half);
const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half);
const uint16x4_t D0 = vmovn_u32(C0);
const uint16x4_t D1 = vmovn_u32(C1);
const uint8x8_t E = vqmovn_u16(vcombine_u16(D0, D1));
vst1_u8(dst + x_out, E);
STORE_32x8(A0, A1, irow + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t frac = (uint32_t)MULT_FIX_FLOOR_C(frow[x_out], yscale);
const int v = (int)MULT_FIX_C(irow[x_out] - frac, fxy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
for (x_out = 0; x_out < max_span; x_out += 8) {
LOAD_32x8(irow + x_out, in0, in1);
const uint32x4_t A0 = MULT_FIX(in0, fxy_scale_half);
const uint32x4_t A1 = MULT_FIX(in1, fxy_scale_half);
const uint16x4_t B0 = vmovn_u32(A0);
const uint16x4_t B1 = vmovn_u32(A1);
const uint8x8_t C = vqmovn_u16(vcombine_u16(B0, B1));
vst1_u8(dst + x_out, C);
STORE_32x8(zero, zero, irow + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
}
#undef MULT_FIX_FLOOR_C
#undef MULT_FIX_C
#undef MULT_FIX_FLOOR
#undef MULT_FIX
#undef ROUNDER
//------------------------------------------------------------------------------
extern void WebPRescalerDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
WebPRescalerExportRowExpand = RescalerExportRowExpand_NEON;
WebPRescalerExportRowShrink = RescalerExportRowShrink_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(WebPRescalerDspInitNEON)
#endif // WEBP_USE_NEON

View File

@ -0,0 +1,366 @@
// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 Rescaling functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE2) && !defined(WEBP_REDUCE_SIZE)
#include <emmintrin.h>
#include <assert.h>
#include "utils_rescaler_utils.h"
#include "utils_utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
#define MULT_FIX_FLOOR(x, y) (((uint64_t)(x) * (y)) >> WEBP_RESCALER_RFIX)
// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
const __m128i C = _mm_srli_si128(B, 8); // E0F0G0H0
*out = _mm_unpacklo_epi16(B, C);
}
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
*out = _mm_unpacklo_epi8(A, zero);
}
static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
const uint8_t* src) {
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
const int x_add = wrk->x_add;
int accum = x_add;
__m128i cur_pixels;
// SSE2 implementation only works with 16b signed arithmetic at max.
if (wrk->src_width < 8 || accum >= (1 << 15)) {
WebPRescalerImportRowExpand_C(wrk, src);
return;
}
assert(!WebPRescalerInputDone(wrk));
assert(wrk->x_expand);
if (wrk->num_channels == 4) {
LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
while (1) {
const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
_mm_storeu_si128((__m128i*)frow, out);
frow += 4;
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
LoadTwoPixels_SSE2(src, &cur_pixels);
src += 4;
accum += x_add;
}
}
} else {
int left;
const uint8_t* const src_limit = src + wrk->src_width - 8;
LoadEightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
while (1) {
const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
const __m128i out = _mm_madd_epi16(cur_pixels, mult);
assert(sizeof(*frow) == sizeof(uint32_t));
WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out));
frow += 1;
if (frow >= frow_end) break;
accum -= wrk->x_sub;
if (accum < 0) {
if (--left) {
cur_pixels = _mm_srli_si128(cur_pixels, 2);
} else if (src <= src_limit) {
LoadEightPixels_SSE2(src, &cur_pixels);
src += 7;
left = 7;
} else { // tail
cur_pixels = _mm_srli_si128(cur_pixels, 2);
cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
src += 1;
left = 1;
}
accum += x_add;
}
}
}
assert(accum == 0);
}
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
const uint8_t* src) {
const int x_sub = wrk->x_sub;
int accum = 0;
const __m128i zero = _mm_setzero_si128();
const __m128i mult0 = _mm_set1_epi16(x_sub);
const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
__m128i sum = zero;
rescaler_t* frow = wrk->frow;
const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
WebPRescalerImportRowShrink_C(wrk, src);
return;
}
assert(!WebPRescalerInputDone(wrk));
assert(!wrk->x_expand);
for (; frow < frow_end; frow += 4) {
__m128i base = zero;
accum += wrk->x_add;
while (accum > 0) {
const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
src += 4;
base = _mm_unpacklo_epi8(A, zero);
// To avoid overflow, we need: base * x_add / x_sub < 32768
// => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
sum = _mm_add_epi16(sum, base);
accum -= x_sub;
}
{ // Emit next horizontal pixel.
const __m128i mult = _mm_set1_epi16(-accum);
const __m128i frac0 = _mm_mullo_epi16(base, mult); // 16b x 16b -> 32b
const __m128i frac1 = _mm_mulhi_epu16(base, mult);
const __m128i frac = _mm_unpacklo_epi16(frac0, frac1); // frac is 32b
const __m128i A0 = _mm_mullo_epi16(sum, mult0);
const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // sum * x_sub
const __m128i frow_out = _mm_sub_epi32(B0, frac); // sum * x_sub - frac
const __m128i D0 = _mm_srli_epi64(frac, 32);
const __m128i D1 = _mm_mul_epu32(frac, mult1); // 32b x 16b -> 64b
const __m128i D2 = _mm_mul_epu32(D0, mult1);
const __m128i E1 = _mm_add_epi64(D1, rounder);
const __m128i E2 = _mm_add_epi64(D2, rounder);
const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
const __m128i G = _mm_unpacklo_epi32(F1, F2);
sum = _mm_packs_epi32(G, zero);
_mm_storeu_si128((__m128i*)frow, frow_out);
}
}
assert(accum == 0);
}
//------------------------------------------------------------------------------
// Row export
// load *src as epi64, multiply by mult and store result in [out0 ... out3]
static WEBP_INLINE void LoadDispatchAndMult_SSE2(const rescaler_t* const src,
const __m128i* const mult,
__m128i* const out0,
__m128i* const out1,
__m128i* const out2,
__m128i* const out3) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
const __m128i A2 = _mm_srli_epi64(A0, 32);
const __m128i A3 = _mm_srli_epi64(A1, 32);
if (mult != NULL) {
*out0 = _mm_mul_epu32(A0, *mult);
*out1 = _mm_mul_epu32(A1, *mult);
*out2 = _mm_mul_epu32(A2, *mult);
*out3 = _mm_mul_epu32(A3, *mult);
} else {
*out0 = A0;
*out1 = A1;
*out2 = A2;
*out3 = A3;
}
}
static WEBP_INLINE void ProcessRow_SSE2(const __m128i* const A0,
const __m128i* const A1,
const __m128i* const A2,
const __m128i* const A3,
const __m128i* const mult,
uint8_t* const dst) {
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
const __m128i B0 = _mm_mul_epu32(*A0, *mult);
const __m128i B1 = _mm_mul_epu32(*A1, *mult);
const __m128i B2 = _mm_mul_epu32(*A2, *mult);
const __m128i B3 = _mm_mul_epu32(*A3, *mult);
const __m128i C0 = _mm_add_epi64(B0, rounder);
const __m128i C1 = _mm_add_epi64(B1, rounder);
const __m128i C2 = _mm_add_epi64(B2, rounder);
const __m128i C3 = _mm_add_epi64(B3, rounder);
const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
#if (WEBP_RESCALER_RFIX < 32)
const __m128i D2 =
_mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask);
const __m128i D3 =
_mm_and_si128(_mm_slli_epi64(C3, 32 - WEBP_RESCALER_RFIX), mask);
#else
const __m128i D2 = _mm_and_si128(C2, mask);
const __m128i D3 = _mm_and_si128(C3, mask);
#endif
const __m128i E0 = _mm_or_si128(D0, D2);
const __m128i E1 = _mm_or_si128(D1, D3);
const __m128i F = _mm_packs_epi32(E0, E1);
const __m128i G = _mm_packus_epi16(F, F);
_mm_storel_epi64((__m128i*)dst, G);
}
static void RescalerExportRowExpand_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* const frow = wrk->frow;
const __m128i mult = _mm_set_epi32(0, wrk->fy_scale, 0, wrk->fy_scale);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0 && wrk->y_sub + wrk->y_accum >= 0);
assert(wrk->y_expand);
if (wrk->y_accum == 0) {
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
LoadDispatchAndMult_SSE2(frow + x_out, NULL, &A0, &A1, &A2, &A3);
ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t J = frow[x_out];
const int v = (int)MULT_FIX(J, wrk->fy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
} else {
const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
const __m128i mA = _mm_set_epi32(0, A, 0, A);
const __m128i mB = _mm_set_epi32(0, B, 0, B);
const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult_SSE2(frow + x_out, &mA, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(irow + x_out, &mB, &B0, &B1, &B2, &B3);
{
const __m128i C0 = _mm_add_epi64(A0, B0);
const __m128i C1 = _mm_add_epi64(A1, B1);
const __m128i C2 = _mm_add_epi64(A2, B2);
const __m128i C3 = _mm_add_epi64(A3, B3);
const __m128i D0 = _mm_add_epi64(C0, rounder);
const __m128i D1 = _mm_add_epi64(C1, rounder);
const __m128i D2 = _mm_add_epi64(C2, rounder);
const __m128i D3 = _mm_add_epi64(C3, rounder);
const __m128i E0 = _mm_srli_epi64(D0, WEBP_RESCALER_RFIX);
const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
const uint64_t I = (uint64_t)A * frow[x_out]
+ (uint64_t)B * irow[x_out];
const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
const int v = (int)MULT_FIX(J, wrk->fy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
}
}
}
static void RescalerExportRowShrink_SSE2(WebPRescaler* const wrk) {
int x_out;
uint8_t* const dst = wrk->dst;
rescaler_t* const irow = wrk->irow;
const int x_out_max = wrk->dst_width * wrk->num_channels;
const rescaler_t* const frow = wrk->frow;
const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
assert(!WebPRescalerOutputDone(wrk));
assert(wrk->y_accum <= 0);
assert(!wrk->y_expand);
if (yscale) {
const int scale_xy = wrk->fxy_scale;
const __m128i mult_xy = _mm_set_epi32(0, scale_xy, 0, scale_xy);
const __m128i mult_y = _mm_set_epi32(0, yscale, 0, yscale);
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3, B0, B1, B2, B3;
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
LoadDispatchAndMult_SSE2(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
{
const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX); // = frac
const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
const __m128i D2 = _mm_srli_epi64(B2, WEBP_RESCALER_RFIX);
const __m128i D3 = _mm_srli_epi64(B3, WEBP_RESCALER_RFIX);
const __m128i E0 = _mm_sub_epi64(A0, D0); // irow[x] - frac
const __m128i E1 = _mm_sub_epi64(A1, D1);
const __m128i E2 = _mm_sub_epi64(A2, D2);
const __m128i E3 = _mm_sub_epi64(A3, D3);
const __m128i F2 = _mm_slli_epi64(D2, 32);
const __m128i F3 = _mm_slli_epi64(D3, 32);
const __m128i G0 = _mm_or_si128(D0, F2);
const __m128i G1 = _mm_or_si128(D1, F3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
ProcessRow_SSE2(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
}
}
for (; x_out < x_out_max; ++x_out) {
const uint32_t frac = (int)MULT_FIX_FLOOR(frow[x_out], yscale);
const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = frac; // new fractional start
}
} else {
const uint32_t scale = wrk->fxy_scale;
const __m128i mult = _mm_set_epi32(0, scale, 0, scale);
const __m128i zero = _mm_setzero_si128();
for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
__m128i A0, A1, A2, A3;
LoadDispatchAndMult_SSE2(irow + x_out, NULL, &A0, &A1, &A2, &A3);
_mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
_mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
ProcessRow_SSE2(&A0, &A1, &A2, &A3, &mult, dst + x_out);
}
for (; x_out < x_out_max; ++x_out) {
const int v = (int)MULT_FIX(irow[x_out], scale);
dst[x_out] = (v > 255) ? 255u : (uint8_t)v;
irow[x_out] = 0;
}
}
}
#undef MULT_FIX_FLOOR
#undef MULT_FIX
#undef ROUNDER
//------------------------------------------------------------------------------
extern void WebPRescalerDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
WebPRescalerImportRowExpand = RescalerImportRowExpand_SSE2;
WebPRescalerImportRowShrink = RescalerImportRowShrink_SSE2;
WebPRescalerExportRowExpand = RescalerExportRowExpand_SSE2;
WebPRescalerExportRowShrink = RescalerExportRowShrink_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPRescalerDspInitSSE2)
#endif // WEBP_USE_SSE2

159
vendor/github.com/sizeofint/webpanimation/dsp_ssim.c generated vendored Normal file
View File

@ -0,0 +1,159 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h> // for abs()
#include "dsp_dsp.h"
#if !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
// SSIM / PSNR
// hat-shaped filter. Sum of coefficients is equal to 16.
static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
1, 2, 3, 4, 3, 2, 1
};
static const uint32_t kWeightSum = 16 * 16; // sum{kWeight}^2
static WEBP_INLINE double SSIMCalculation(
const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
const uint32_t w2 = N * N;
const uint32_t C1 = 20 * w2;
const uint32_t C2 = 60 * w2;
const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
if (xmxm + ymym >= C3) {
const int64_t xmym = (int64_t)stats->xm * stats->ym;
const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
const uint64_t syy = (uint64_t)stats->yym * N - ymym;
// we descale by 8 to prevent overflow during the fnum/fden multiply.
const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
const uint64_t den_S = (sxx + syy + C2) >> 8;
const uint64_t fnum = (2 * xmym + C1) * num_S;
const uint64_t fden = (xmxm + ymym + C1) * den_S;
const double r = (double)fnum / fden;
assert(r >= 0. && r <= 1.0);
return r;
}
return 1.; // area is too dark to contribute meaningfully
}
double VP8SSIMFromStats(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, kWeightSum);
}
double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
return SSIMCalculation(stats, stats->w);
}
static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2,
int xo, int yo, int W, int H) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
: yo + VP8_SSIM_KERNEL;
const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL;
const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1
: xo + VP8_SSIM_KERNEL;
int x, y;
src1 += ymin * stride1;
src2 += ymin * stride2;
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
for (x = xmin; x <= xmax; ++x) {
const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
* kWeight[VP8_SSIM_KERNEL + y - yo];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.w += w;
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStatsClipped(&stats);
}
static double SSIMGet_C(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
int x, y;
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
const uint32_t w = kWeight[x] * kWeight[y];
const uint32_t s1 = src1[x];
const uint32_t s2 = src2[x];
stats.xm += w * s1;
stats.ym += w * s2;
stats.xxm += w * s1 * s1;
stats.xym += w * s1 * s2;
stats.yym += w * s2 * s2;
}
}
return VP8SSIMFromStats(&stats);
}
#endif // !defined(WEBP_REDUCE_SIZE)
//------------------------------------------------------------------------------
#if !defined(WEBP_DISABLE_STATS)
static uint32_t AccumulateSSE_C(const uint8_t* src1,
const uint8_t* src2, int len) {
int i;
uint32_t sse2 = 0;
assert(len <= 65535); // to ensure that accumulation fits within uint32_t
for (i = 0; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
#endif
//------------------------------------------------------------------------------
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetFunc VP8SSIMGet;
VP8SSIMGetClippedFunc VP8SSIMGetClipped;
#endif
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSEFunc VP8AccumulateSSE;
#endif
extern void VP8SSIMDspInitSSE2(void);
WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGetClipped = SSIMGetClipped_C;
VP8SSIMGet = SSIMGet_C;
#endif
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSE = AccumulateSSE_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8SSIMDspInitSSE2();
}
#endif
}
}

View File

@ -0,0 +1,165 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of distortion calculation
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include "dsp_common_sse2.h"
#if !defined(WEBP_DISABLE_STATS)
// Helper function
static WEBP_INLINE void SubtractAndSquare_SSE2(const __m128i a, const __m128i b,
__m128i* const sum) {
// take abs(a-b) in 8b
const __m128i a_b = _mm_subs_epu8(a, b);
const __m128i b_a = _mm_subs_epu8(b, a);
const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
// zero-extend to 16b
const __m128i zero = _mm_setzero_si128();
const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
// multiply with self
const __m128i sum1 = _mm_madd_epi16(C0, C0);
const __m128i sum2 = _mm_madd_epi16(C1, C1);
*sum = _mm_add_epi32(sum1, sum2);
}
//------------------------------------------------------------------------------
// SSIM / PSNR entry point
static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
const uint8_t* src2, int len) {
int i = 0;
uint32_t sse2 = 0;
if (len >= 16) {
const int limit = len - 32;
int32_t tmp[4];
__m128i sum1;
__m128i sum = _mm_setzero_si128();
__m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
__m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
while (i <= limit) {
const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
__m128i sum2;
i += 16;
SubtractAndSquare_SSE2(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
i += 16;
SubtractAndSquare_SSE2(a1, b1, &sum2);
sum = _mm_add_epi32(sum, sum2);
}
SubtractAndSquare_SSE2(a0, b0, &sum1);
sum = _mm_add_epi32(sum, sum1);
_mm_storeu_si128((__m128i*)tmp, sum);
sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
for (; i < len; ++i) {
const int32_t diff = src1[i] - src2[i];
sse2 += diff * diff;
}
return sse2;
}
#endif // !defined(WEBP_DISABLE_STATS)
#if !defined(WEBP_REDUCE_SIZE)
static uint32_t HorizontalAdd16b_SSE2(const __m128i* const m) {
uint16_t tmp[8];
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi16(*m, a);
_mm_storeu_si128((__m128i*)tmp, b);
return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
}
static uint32_t HorizontalAdd32b_SSE2(const __m128i* const m) {
const __m128i a = _mm_srli_si128(*m, 8);
const __m128i b = _mm_add_epi32(*m, a);
const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
return (uint32_t)_mm_cvtsi128_si32(c);
}
static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
#define ACCUMULATE_ROW(WEIGHT) do { \
/* compute row weight (Wx * Wy) */ \
const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
const __m128i W = _mm_mullo_epi16(Wx, Wy); \
/* process 8 bytes at a time (7 bytes, actually) */ \
const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
/* convert to 16b and multiply by weight */ \
const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
const __m128i wa1 = _mm_mullo_epi16(a1, W); \
const __m128i wb1 = _mm_mullo_epi16(b1, W); \
/* accumulate */ \
xm = _mm_add_epi16(xm, wa1); \
ym = _mm_add_epi16(ym, wb1); \
xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
src1 += stride1; \
src2 += stride2; \
} while (0)
static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
const uint8_t* src2, int stride2) {
VP8DistoStats stats;
const __m128i zero = _mm_setzero_si128();
__m128i xm = zero, ym = zero; // 16b accums
__m128i xxm = zero, yym = zero, xym = zero; // 32b accum
const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
assert(2 * VP8_SSIM_KERNEL + 1 == 7);
ACCUMULATE_ROW(1);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(4);
ACCUMULATE_ROW(3);
ACCUMULATE_ROW(2);
ACCUMULATE_ROW(1);
stats.xm = HorizontalAdd16b_SSE2(&xm);
stats.ym = HorizontalAdd16b_SSE2(&ym);
stats.xxm = HorizontalAdd32b_SSE2(&xxm);
stats.xym = HorizontalAdd32b_SSE2(&xym);
stats.yym = HorizontalAdd32b_SSE2(&yym);
return VP8SSIMFromStats(&stats);
}
#endif // !defined(WEBP_REDUCE_SIZE)
extern void VP8SSIMDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
#if !defined(WEBP_DISABLE_STATS)
VP8AccumulateSSE = AccumulateSSE_SSE2;
#endif
#if !defined(WEBP_REDUCE_SIZE)
VP8SSIMGet = SSIMGet_SSE2;
#endif
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
#endif // WEBP_USE_SSE2

View File

@ -0,0 +1,327 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV to RGB upsampling functions.
//
// Author: somnath@google.com (Somnath Banerjee)
#include "dsp_dsp.h"
#include "dsp_yuv.h"
#include <assert.h>
//------------------------------------------------------------------------------
// Fancy upsampler
#ifdef FANCY_UPSAMPLING
// Fancy upsampling functions to convert YUV to RGB
WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST];
// Given samples laid out in a square as:
// [a b]
// [c d]
// we interpolate u/v as:
// ([9*a + 3*b + 3*c + d 3*a + 9*b + 3*c + d] + [8 8]) / 16
// ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16
// We process u and v together stashed into 32bit (16bit each).
#define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int x; \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \
assert(top_y != NULL); \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
} \
for (x = 1; x <= last_pixel_pair; ++x) { \
const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
/* precompute invariant values associated with first and second diagonals*/\
const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
{ \
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (2 * x - 1) * (XSTEP)); \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
top_dst + (2 * x - 0) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (2 * x - 1) * (XSTEP)); \
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
bottom_dst + (2 * x + 0) * (XSTEP)); \
} \
tl_uv = t_uv; \
l_uv = uv; \
} \
if (!(len & 1)) { \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (len - 1) * (XSTEP)); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (len - 1) * (XSTEP)); \
} \
} \
}
// All variants implemented.
#if !WEBP_NEON_OMIT_C_CODE
UPSAMPLE_FUNC(UpsampleRgbaLinePair_C, VP8YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair_C, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleArgbLinePair_C, VP8YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgbLinePair_C, VP8YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair_C, VP8YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair_C, VP8YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair_C, VP8YuvToRgb565, 2)
#else
static void EmptyUpsampleFunc(const uint8_t* top_y, const uint8_t* bottom_y,
const uint8_t* top_u, const uint8_t* top_v,
const uint8_t* cur_u, const uint8_t* cur_v,
uint8_t* top_dst, uint8_t* bottom_dst, int len) {
(void)top_y;
(void)bottom_y;
(void)top_u;
(void)top_v;
(void)cur_u;
(void)cur_v;
(void)top_dst;
(void)bottom_dst;
(void)len;
assert(0); // COLORSPACE SUPPORT NOT COMPILED
}
#define UpsampleArgbLinePair_C EmptyUpsampleFunc
#define UpsampleRgbLinePair_C EmptyUpsampleFunc
#define UpsampleBgrLinePair_C EmptyUpsampleFunc
#define UpsampleRgba4444LinePair_C EmptyUpsampleFunc
#define UpsampleRgb565LinePair_C EmptyUpsampleFunc
#endif // WEBP_REDUCE_CSP
#endif
#undef LOAD_UV
#undef UPSAMPLE_FUNC
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
#if !defined(FANCY_UPSAMPLING)
#define DUAL_SAMPLE_FUNC(FUNC_NAME, FUNC) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* bot_u, const uint8_t* bot_v, \
uint8_t* top_dst, uint8_t* bot_dst, int len) { \
const int half_len = len >> 1; \
int x; \
assert(top_dst != NULL); \
{ \
for (x = 0; x < half_len; ++x) { \
FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0); \
FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4); \
} \
if (len & 1) FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x); \
} \
if (bot_dst != NULL) { \
for (x = 0; x < half_len; ++x) { \
FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x + 0); \
FUNC(bot_y[2 * x + 1], bot_u[x], bot_v[x], bot_dst + 8 * x + 4); \
} \
if (len & 1) FUNC(bot_y[2 * x + 0], bot_u[x], bot_v[x], bot_dst + 8 * x); \
} \
}
DUAL_SAMPLE_FUNC(DualLineSamplerBGRA, VP8YuvToBgra)
DUAL_SAMPLE_FUNC(DualLineSamplerARGB, VP8YuvToArgb)
#undef DUAL_SAMPLE_FUNC
#endif // !FANCY_UPSAMPLING
WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
WebPInitUpsamplers();
#ifdef FANCY_UPSAMPLING
return WebPUpsamplers[alpha_is_last ? MODE_BGRA : MODE_ARGB];
#else
return (alpha_is_last ? DualLineSamplerBGRA : DualLineSamplerARGB);
#endif
}
//------------------------------------------------------------------------------
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * (XSTEP)]); \
}
YUV444_FUNC(WebPYuv444ToRgba_C, VP8YuvToRgba, 4)
YUV444_FUNC(WebPYuv444ToBgra_C, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(WebPYuv444ToRgb_C, VP8YuvToRgb, 3)
YUV444_FUNC(WebPYuv444ToBgr_C, VP8YuvToBgr, 3)
YUV444_FUNC(WebPYuv444ToArgb_C, VP8YuvToArgb, 4)
YUV444_FUNC(WebPYuv444ToRgba4444_C, VP8YuvToRgba4444, 2)
YUV444_FUNC(WebPYuv444ToRgb565_C, VP8YuvToRgb565, 2)
#else
static void EmptyYuv444Func(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
(void)y;
(void)u;
(void)v;
(void)dst;
(void)len;
}
#define WebPYuv444ToRgb_C EmptyYuv444Func
#define WebPYuv444ToBgr_C EmptyYuv444Func
#define WebPYuv444ToArgb_C EmptyYuv444Func
#define WebPYuv444ToRgba4444_C EmptyYuv444Func
#define WebPYuv444ToRgb565_C EmptyYuv444Func
#endif // WEBP_REDUCE_CSP
#undef YUV444_FUNC
WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
extern void WebPInitYUV444ConvertersSSE2(void);
extern void WebPInitYUV444ConvertersSSE41(void);
WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgba_C;
WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgra_C;
WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgb_C;
WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgr_C;
WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgb_C;
WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444_C;
WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565_C;
WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgba_C;
WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgra_C;
WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgb_C;
WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444_C;
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitYUV444ConvertersSSE2();
}
#endif
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitYUV444ConvertersSSE41();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitYUV444ConvertersMIPSdspR2();
}
#endif
}
}
//------------------------------------------------------------------------------
// Main calls
extern void WebPInitUpsamplersSSE2(void);
extern void WebPInitUpsamplersSSE41(void);
extern void WebPInitUpsamplersNEON(void);
extern void WebPInitUpsamplersMIPSdspR2(void);
extern void WebPInitUpsamplersMSA(void);
WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
#ifdef FANCY_UPSAMPLING
#if !WEBP_NEON_OMIT_C_CODE
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_C;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_C;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_C;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_C;
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_C;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_C;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_C;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_C;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_C;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_C;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_C;
#endif
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitUpsamplersSSE2();
}
#endif
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitUpsamplersSSE41();
}
#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitUpsamplersMIPSdspR2();
}
#endif
#if defined(WEBP_USE_MSA)
if (VP8GetCPUInfo(kMSA)) {
WebPInitUpsamplersMSA();
}
#endif
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitUpsamplersNEON();
}
#endif
assert(WebPUpsamplers[MODE_RGBA] != NULL);
assert(WebPUpsamplers[MODE_BGRA] != NULL);
assert(WebPUpsamplers[MODE_rgbA] != NULL);
assert(WebPUpsamplers[MODE_bgrA] != NULL);
#if !defined(WEBP_REDUCE_CSP) || !WEBP_NEON_OMIT_C_CODE
assert(WebPUpsamplers[MODE_RGB] != NULL);
assert(WebPUpsamplers[MODE_BGR] != NULL);
assert(WebPUpsamplers[MODE_ARGB] != NULL);
assert(WebPUpsamplers[MODE_RGBA_4444] != NULL);
assert(WebPUpsamplers[MODE_RGB_565] != NULL);
assert(WebPUpsamplers[MODE_Argb] != NULL);
assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
#endif
#endif // FANCY_UPSAMPLING
}
//------------------------------------------------------------------------------

View File

@ -0,0 +1,291 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV to RGB upsampling functions.
//
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include <assert.h>
#include "dsp_yuv.h"
#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
const int t1 = MultHi(Y, 19077); \
const int t2 = MultHi(V, 13320); \
R = MultHi(V, 26149); \
G = MultHi(U, 6419); \
B = MultHi(U, 33050); \
R = t1 + R; \
G = t1 - G; \
B = t1 + B; \
R = R - 14234; \
G = G - t2 + 8708; \
B = B - 17685; \
__asm__ volatile ( \
"shll_s.w %[" #R "], %[" #R "], 17 \n\t" \
"shll_s.w %[" #G "], %[" #G "], 17 \n\t" \
"shll_s.w %[" #B "], %[" #B "], 17 \n\t" \
"precrqu_s.qb.ph %[" #R "], %[" #R "], $zero \n\t" \
"precrqu_s.qb.ph %[" #G "], %[" #G "], $zero \n\t" \
"precrqu_s.qb.ph %[" #B "], %[" #B "], $zero \n\t" \
"srl %[" #R "], %[" #R "], 24 \n\t" \
"srl %[" #G "], %[" #G "], 24 \n\t" \
"srl %[" #B "], %[" #B "], 24 \n\t" \
: [R]"+r"(R), [G]"+r"(G), [B]"+r"(B) \
: \
); \
} while (0)
#if !defined(WEBP_REDUCE_CSP)
static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
rgb[0] = r;
rgb[1] = g;
rgb[2] = b;
}
static WEBP_INLINE void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
bgr[0] = b;
bgr[1] = g;
bgr[2] = r;
}
static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
{
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
rgb[0] = rg;
rgb[1] = gb;
#endif
}
}
static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
uint8_t* const argb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
{
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
argb[0] = rg;
argb[1] = ba;
#endif
}
}
#endif // WEBP_REDUCE_CSP
//-----------------------------------------------------------------------------
// Alpha handling variants
#if !defined(WEBP_REDUCE_CSP)
static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
argb[0] = 0xff;
argb[1] = r;
argb[2] = g;
argb[3] = b;
}
#endif // WEBP_REDUCE_CSP
static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const bgra) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
bgra[0] = b;
bgra[1] = g;
bgra[2] = r;
bgra[3] = 0xff;
}
static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const rgba) {
int r, g, b;
YUV_TO_RGB(y, u, v, r, g, b);
rgba[0] = r;
rgba[1] = g;
rgba[2] = b;
rgba[3] = 0xff;
}
//------------------------------------------------------------------------------
// Fancy upsampler
#ifdef FANCY_UPSAMPLING
// Given samples laid out in a square as:
// [a b]
// [c d]
// we interpolate u/v as:
// ([9*a + 3*b + 3*c + d 3*a + 9*b + 3*c + d] + [8 8]) / 16
// ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16
// We process u and v together stashed into 32bit (16bit each).
#define LOAD_UV(u, v) ((u) | ((v) << 16))
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int x; \
const int last_pixel_pair = (len - 1) >> 1; \
uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \
assert(top_y != NULL); \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
} \
for (x = 1; x <= last_pixel_pair; ++x) { \
const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
/* precompute invariant values associated with first and second diagonals*/\
const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
{ \
const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (2 * x - 1) * XSTEP); \
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
top_dst + (2 * x - 0) * XSTEP); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
const uint32_t uv1 = (diag_12 + uv) >> 1; \
FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (2 * x - 1) * XSTEP); \
FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
bottom_dst + (2 * x + 0) * XSTEP); \
} \
tl_uv = t_uv; \
l_uv = uv; \
} \
if (!(len & 1)) { \
{ \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
top_dst + (len - 1) * XSTEP); \
} \
if (bottom_y != NULL) { \
const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
bottom_dst + (len - 1) * XSTEP); \
} \
} \
}
// All variants implemented.
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef LOAD_UV
#undef UPSAMPLE_FUNC
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitUpsamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}
YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef YUV444_FUNC
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitYUV444ConvertersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
#endif // WEBP_REDUCE_CSP
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersMIPSdspR2)
#endif

View File

@ -0,0 +1,688 @@
// Copyright 2016 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MSA version of YUV to RGB upsampling functions.
//
// Author: Prashant Patil (prashant.patil@imgtec.com)
#include <string.h>
#include "dsp_dsp.h"
#if defined(WEBP_USE_MSA)
#include "dsp_msa_macro.h"
#include "dsp_yuv.h"
#ifdef FANCY_UPSAMPLING
#define ILVR_UW2(in, out0, out1) do { \
const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in); \
out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0); \
out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0); \
} while (0)
#define ILVRL_UW4(in, out0, out1, out2, out3) do { \
v16u8 t0, t1; \
ILVRL_B2_UB(zero, in, t0, t1); \
ILVRL_H2_UW(zero, t0, out0, out1); \
ILVRL_H2_UW(zero, t1, out2, out3); \
} while (0)
#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do { \
const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
v4u32 temp0, temp1, temp2, temp3; \
MUL4(in0, const0, in1, const0, in2, const0, in3, const0, \
temp0, temp1, temp2, temp3); \
PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1); \
} while (0)
#define MULTHI_8(in0, in1, cnst, out0) do { \
const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
v4u32 temp0, temp1; \
MUL2(in0, const0, in1, const0, temp0, temp1); \
out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0); \
} while (0)
#define CALC_R16(y0, y1, v0, v1, dst) do { \
const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1); \
v8i16 b0 = __msa_subs_s_h(a0, const_a); \
v8i16 b1 = __msa_subs_s_h(a1, const_a); \
SRAI_H2_SH(b0, b1, 6); \
CLIP_SH2_0_255(b0, b1); \
dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
} while (0)
#define CALC_R8(y0, v0, dst) do { \
const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
v8i16 b0 = __msa_subs_s_h(a0, const_a); \
b0 = SRAI_H(b0, 6); \
CLIP_SH_0_255(b0); \
dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
} while (0)
#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do { \
const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1); \
const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1); \
a0 = __msa_adds_s_h(b0, const_a); \
a1 = __msa_adds_s_h(b1, const_a); \
SRAI_H2_SH(a0, a1, 6); \
CLIP_SH2_0_255(a0, a1); \
dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0); \
} while (0)
#define CALC_G8(y0, u0, v0, dst) do { \
const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
a0 = __msa_adds_s_h(b0, const_a); \
a0 = SRAI_H(a0, 6); \
CLIP_SH_0_255(a0); \
dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0); \
} while (0)
#define CALC_B16(y0, y1, u0, u1, dst) do { \
const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1); \
v8u16 b0 = __msa_subs_u_h(a0, const_a); \
v8u16 b1 = __msa_subs_u_h(a1, const_a); \
SRAI_H2_UH(b0, b1, 6); \
CLIP_UH2_0_255(b0, b1); \
dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
} while (0)
#define CALC_B8(y0, u0, dst) do { \
const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
v8u16 b0 = __msa_subs_u_h(a0, const_a); \
b0 = SRAI_H(b0, 6); \
CLIP_UH_0_255(b0); \
dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
} while (0)
#define CALC_RGB16(y, u, v, R, G, B) do { \
const v16u8 zero = { 0 }; \
v8u16 y0, y1, u0, u1, v0, v1; \
v4u32 p0, p1, p2, p3; \
const v16u8 in_y = LD_UB(y); \
const v16u8 in_u = LD_UB(u); \
const v16u8 in_v = LD_UB(v); \
ILVRL_UW4(in_y, p0, p1, p2, p3); \
MULTHI_16(p0, p1, p2, p3, 19077, y0, y1); \
ILVRL_UW4(in_v, p0, p1, p2, p3); \
MULTHI_16(p0, p1, p2, p3, 26149, v0, v1); \
CALC_R16(y0, y1, v0, v1, R); \
MULTHI_16(p0, p1, p2, p3, 13320, v0, v1); \
ILVRL_UW4(in_u, p0, p1, p2, p3); \
MULTHI_16(p0, p1, p2, p3, 6419, u0, u1); \
CALC_G16(y0, y1, u0, u1, v0, v1, G); \
MULTHI_16(p0, p1, p2, p3, 33050, u0, u1); \
CALC_B16(y0, y1, u0, u1, B); \
} while (0)
#define CALC_RGB8(y, u, v, R, G, B) do { \
const v16u8 zero = { 0 }; \
v8u16 y0, u0, v0; \
v4u32 p0, p1; \
const v16u8 in_y = LD_UB(y); \
const v16u8 in_u = LD_UB(u); \
const v16u8 in_v = LD_UB(v); \
ILVR_UW2(in_y, p0, p1); \
MULTHI_8(p0, p1, 19077, y0); \
ILVR_UW2(in_v, p0, p1); \
MULTHI_8(p0, p1, 26149, v0); \
CALC_R8(y0, v0, R); \
MULTHI_8(p0, p1, 13320, v0); \
ILVR_UW2(in_u, p0, p1); \
MULTHI_8(p0, p1, 6419, u0); \
CALC_G8(y0, u0, v0, G); \
MULTHI_8(p0, p1, 33050, u0); \
CALC_B8(y0, u0, B); \
} while (0)
#define STORE16_3(a0, a1, a2, dst) do { \
const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
8, 9, 20, 10 }; \
const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, \
8, 25, 9, 10 }; \
const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7, \
30, 8, 9, 31 }; \
v16u8 out0, out1, out2, tmp0, tmp1, tmp2; \
ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
out0 = VSHF_UB(tmp0, a2, mask0); \
tmp2 = SLDI_UB(tmp1, tmp0, 11); \
out1 = VSHF_UB(tmp2, a2, mask1); \
tmp2 = SLDI_UB(tmp1, tmp1, 6); \
out2 = VSHF_UB(tmp2, a2, mask2); \
ST_UB(out0, dst + 0); \
ST_UB(out1, dst + 16); \
ST_UB(out2, dst + 32); \
} while (0)
#define STORE8_3(a0, a1, a2, dst) do { \
int64_t out_m; \
const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
8, 9, 20, 10 }; \
const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23, \
255, 255, 255, 255, 255, 255, 255, 255 }; \
const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
v16u8 out0, out1; \
VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1); \
ST_UB(out0, dst); \
out_m = __msa_copy_s_d((v2i64)out1, 0); \
SD(out_m, dst + 16); \
} while (0)
#define STORE16_4(a0, a1, a2, a3, dst) do { \
v16u8 tmp0, tmp1, tmp2, tmp3; \
v16u8 out0, out1, out2, out3; \
ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
ILVRL_B2_UB(a3, a2, tmp2, tmp3); \
ILVRL_H2_UB(tmp2, tmp0, out0, out1); \
ILVRL_H2_UB(tmp3, tmp1, out2, out3); \
ST_UB(out0, dst + 0); \
ST_UB(out1, dst + 16); \
ST_UB(out2, dst + 32); \
ST_UB(out3, dst + 48); \
} while (0)
#define STORE8_4(a0, a1, a2, a3, dst) do { \
v16u8 tmp0, tmp1, tmp2, tmp3; \
ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1); \
ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3); \
ST_UB(tmp2, dst + 0); \
ST_UB(tmp3, dst + 16); \
} while (0)
#define STORE2_16(a0, a1, dst) do { \
v16u8 out0, out1; \
ILVRL_B2_UB(a1, a0, out0, out1); \
ST_UB(out0, dst + 0); \
ST_UB(out1, dst + 16); \
} while (0)
#define STORE2_8(a0, a1, dst) do { \
const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
ST_UB(out0, dst); \
} while (0)
#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do { \
CALC_RGB##N(y, u, v, R, G, B); \
tmp0 = ANDI_B(R, 0xf0); \
tmp1 = SRAI_B(G, 4); \
RG = tmp0 | tmp1; \
tmp0 = ANDI_B(B, 0xf0); \
BA = ORI_B(tmp0, 0x0f); \
STORE2_##N(out0, out1, dst); \
} while (0)
#define CALC_RGB565(y, u, v, out0, out1, N, dst) do { \
CALC_RGB##N(y, u, v, R, G, B); \
tmp0 = ANDI_B(R, 0xf8); \
tmp1 = SRAI_B(G, 5); \
RG = tmp0 | tmp1; \
tmp0 = SLLI_B(G, 3); \
tmp1 = ANDI_B(tmp0, 0xe0); \
tmp0 = SRAI_B(B, 3); \
GB = tmp0 | tmp1; \
STORE2_##N(out0, out1, dst); \
} while (0)
static WEBP_INLINE int Clip8(int v) {
return v < 0 ? 0 : v > 255 ? 255 : v;
}
static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
const int y1 = MultHi(y, 19077);
const int r1 = y1 + MultHi(v, 26149) - 14234;
const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
const int b1 = y1 + MultHi(u, 33050) - 17685;
rgb[0] = Clip8(r1 >> 6);
rgb[1] = Clip8(g1 >> 6);
rgb[2] = Clip8(b1 >> 6);
}
static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
const int y1 = MultHi(y, 19077);
const int r1 = y1 + MultHi(v, 26149) - 14234;
const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
const int b1 = y1 + MultHi(u, 33050) - 17685;
bgr[0] = Clip8(b1 >> 6);
bgr[1] = Clip8(g1 >> 6);
bgr[2] = Clip8(r1 >> 6);
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
const int y1 = MultHi(y, 19077);
const int r1 = y1 + MultHi(v, 26149) - 14234;
const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
const int b1 = y1 + MultHi(u, 33050) - 17685;
const int r = Clip8(r1 >> 6);
const int g = Clip8(g1 >> 6);
const int b = Clip8(b1 >> 6);
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
rgb[0] = rg;
rgb[1] = gb;
#endif
}
static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
const int y1 = MultHi(y, 19077);
const int r1 = y1 + MultHi(v, 26149) - 14234;
const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
const int b1 = y1 + MultHi(u, 33050) - 17685;
const int r = Clip8(r1 >> 6);
const int g = Clip8(g1 >> 6);
const int b = Clip8(b1 >> 6);
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
argb[0] = rg;
argb[1] = ba;
#endif
}
static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
argb[0] = 0xff;
YuvToRgb(y, u, v, argb + 1);
}
#endif // WEBP_REDUCE_CSP
static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
YuvToBgr(y, u, v, bgra);
bgra[3] = 0xff;
}
static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
YuvToRgb(y, u, v, rgba);
rgba[3] = 0xff;
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_3(R, G, B, dst);
y += 16;
u += 16;
v += 16;
dst += 16 * 3;
length -= 16;
}
if (length > 8) {
uint8_t temp[3 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB16(temp, u, v, R, G, B);
STORE16_3(R, G, B, temp);
memcpy(dst, temp, length * 3 * sizeof(*dst));
} else if (length > 0) {
uint8_t temp[3 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB8(temp, u, v, R, G, B);
STORE8_3(R, G, B, temp);
memcpy(dst, temp, length * 3 * sizeof(*dst));
}
}
static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_3(B, G, R, dst);
y += 16;
u += 16;
v += 16;
dst += 16 * 3;
length -= 16;
}
if (length > 8) {
uint8_t temp[3 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB16(temp, u, v, R, G, B);
STORE16_3(B, G, R, temp);
memcpy(dst, temp, length * 3 * sizeof(*dst));
} else if (length > 0) {
uint8_t temp[3 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB8(temp, u, v, R, G, B);
STORE8_3(B, G, R, temp);
memcpy(dst, temp, length * 3 * sizeof(*dst));
}
}
#endif // WEBP_REDUCE_CSP
static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(R, G, B, A, dst);
y += 16;
u += 16;
v += 16;
dst += 16 * 4;
length -= 16;
}
if (length > 8) {
uint8_t temp[4 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB16(&temp[0], u, v, R, G, B);
STORE16_4(R, G, B, A, temp);
memcpy(dst, temp, length * 4 * sizeof(*dst));
} else if (length > 0) {
uint8_t temp[4 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB8(temp, u, v, R, G, B);
STORE8_4(R, G, B, A, temp);
memcpy(dst, temp, length * 4 * sizeof(*dst));
}
}
static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(B, G, R, A, dst);
y += 16;
u += 16;
v += 16;
dst += 16 * 4;
length -= 16;
}
if (length > 8) {
uint8_t temp[4 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB16(temp, u, v, R, G, B);
STORE16_4(B, G, R, A, temp);
memcpy(dst, temp, length * 4 * sizeof(*dst));
} else if (length > 0) {
uint8_t temp[4 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB8(temp, u, v, R, G, B);
STORE8_4(B, G, R, A, temp);
memcpy(dst, temp, length * 4 * sizeof(*dst));
}
}
#if !defined(WEBP_REDUCE_CSP)
static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B;
const v16u8 A = (v16u8)__msa_ldi_b(ALPHAVAL);
while (length >= 16) {
CALC_RGB16(y, u, v, R, G, B);
STORE16_4(A, R, G, B, dst);
y += 16;
u += 16;
v += 16;
dst += 16 * 4;
length -= 16;
}
if (length > 8) {
uint8_t temp[4 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB16(temp, u, v, R, G, B);
STORE16_4(A, R, G, B, temp);
memcpy(dst, temp, length * 4 * sizeof(*dst));
} else if (length > 0) {
uint8_t temp[4 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
CALC_RGB8(temp, u, v, R, G, B);
STORE8_4(A, R, G, B, temp);
memcpy(dst, temp, length * 4 * sizeof(*dst));
}
}
static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B, RG, BA, tmp0, tmp1;
while (length >= 16) {
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
#else
CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
#endif
y += 16;
u += 16;
v += 16;
dst += 16 * 2;
length -= 16;
}
if (length > 8) {
uint8_t temp[2 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
#else
CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
#endif
memcpy(dst, temp, length * 2 * sizeof(*dst));
} else if (length > 0) {
uint8_t temp[2 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
#else
CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
#endif
memcpy(dst, temp, length * 2 * sizeof(*dst));
}
}
static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst, int length) {
v16u8 R, G, B, RG, GB, tmp0, tmp1;
while (length >= 16) {
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(y, u, v, GB, RG, 16, dst);
#else
CALC_RGB565(y, u, v, RG, GB, 16, dst);
#endif
y += 16;
u += 16;
v += 16;
dst += 16 * 2;
length -= 16;
}
if (length > 8) {
uint8_t temp[2 * 16] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(temp, u, v, GB, RG, 16, temp);
#else
CALC_RGB565(temp, u, v, RG, GB, 16, temp);
#endif
memcpy(dst, temp, length * 2 * sizeof(*dst));
} else if (length > 0) {
uint8_t temp[2 * 8] = { 0 };
memcpy(temp, y, length * sizeof(*temp));
#if (WEBP_SWAP_16BIT_CSP == 1)
CALC_RGB565(temp, u, v, GB, RG, 8, temp);
#else
CALC_RGB565(temp, u, v, RG, GB, 8, temp);
#endif
memcpy(dst, temp, length * 2 * sizeof(*dst));
}
}
#endif // WEBP_REDUCE_CSP
#define UPSAMPLE_32PIXELS(a, b, c, d) do { \
v16u8 s = __msa_aver_u_b(a, d); \
v16u8 t = __msa_aver_u_b(b, c); \
const v16u8 st = s ^ t; \
v16u8 ad = a ^ d; \
v16u8 bc = b ^ c; \
v16u8 t0 = ad | bc; \
v16u8 t1 = t0 | st; \
v16u8 t2 = ANDI_B(t1, 1); \
v16u8 t3 = __msa_aver_u_b(s, t); \
const v16u8 k = t3 - t2; \
v16u8 diag1, diag2; \
AVER_UB2_UB(t, k, s, k, t0, t1); \
bc = bc & st; \
ad = ad & st; \
t = t ^ k; \
s = s ^ k; \
t2 = bc | t; \
t3 = ad | s; \
t2 = ANDI_B(t2, 1); \
t3 = ANDI_B(t3, 1); \
SUB2(t0, t2, t1, t3, diag1, diag2); \
AVER_UB2_UB(a, diag1, b, diag2, t0, t1); \
ILVRL_B2_UB(t1, t0, a, b); \
if (pbot_y != NULL) { \
AVER_UB2_UB(c, diag2, d, diag1, t0, t1); \
ILVRL_B2_UB(t1, t0, c, d); \
} \
} while (0)
#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bot_dst, int len) \
{ \
int size = (len - 1) >> 1; \
uint8_t temp_u[64]; \
uint8_t temp_v[64]; \
const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16)); \
const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16)); \
const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
const uint8_t* ptop_y = &top_y[1]; \
uint8_t* ptop_dst = top_dst + XSTEP; \
const uint8_t* pbot_y = &bot_y[1]; \
uint8_t* pbot_dst = bot_dst + XSTEP; \
\
FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
if (bot_y != NULL) { \
const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst); \
} \
while (size >= 16) { \
v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
LD_UB2(top_u, 1, tu0, tu1); \
LD_UB2(cur_u, 1, cu0, cu1); \
LD_UB2(top_v, 1, tv0, tv1); \
LD_UB2(cur_v, 1, cv0, cv1); \
UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32); \
if (bot_y != NULL) { \
FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32); \
} \
ptop_y += 32; \
pbot_y += 32; \
ptop_dst += XSTEP * 32; \
pbot_dst += XSTEP * 32; \
top_u += 16; \
top_v += 16; \
cur_u += 16; \
cur_v += 16; \
size -= 16; \
} \
if (size > 0) { \
v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
memcpy(&temp_u[ 0], top_u, 17 * sizeof(uint8_t)); \
memcpy(&temp_u[32], cur_u, 17 * sizeof(uint8_t)); \
memcpy(&temp_v[ 0], top_v, 17 * sizeof(uint8_t)); \
memcpy(&temp_v[32], cur_v, 17 * sizeof(uint8_t)); \
LD_UB2(&temp_u[ 0], 1, tu0, tu1); \
LD_UB2(&temp_u[32], 1, cu0, cu1); \
LD_UB2(&temp_v[ 0], 1, tv0, tv1); \
LD_UB2(&temp_v[32], 1, cv0, cv1); \
UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2); \
if (bot_y != NULL) { \
FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2); \
} \
top_u += size; \
top_v += size; \
cur_u += size; \
cur_v += size; \
} \
if (!(len & 1)) { \
const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16)); \
const uint32_t c0 = ((cur_u[0]) | ((cur_v[0]) << 16)); \
const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2; \
FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16), \
top_dst + (len - 1) * XSTEP); \
if (bot_y != NULL) { \
const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2; \
FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16), \
bot_dst + (len - 1) * XSTEP); \
} \
} \
}
UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersMSA(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
#endif // WEBP_USE_MSA
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
#endif

View File

@ -0,0 +1,285 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// NEON version of YUV to RGB upsampling functions.
//
// Author: mans@mansr.com (Mans Rullgard)
// Based on SSE code by: somnath@google.com (Somnath Banerjee)
#include "dsp_dsp.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <arm_neon.h>
#include <string.h>
#include "dsp_neon.h"
#include "dsp_yuv.h"
#ifdef FANCY_UPSAMPLING
//-----------------------------------------------------------------------------
// U/V upsampling
// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
#define UPSAMPLE_16PIXELS(r1, r2, out) do { \
const uint8x8_t a = vld1_u8(r1 + 0); \
const uint8x8_t b = vld1_u8(r1 + 1); \
const uint8x8_t c = vld1_u8(r2 + 0); \
const uint8x8_t d = vld1_u8(r2 + 1); \
/* a + b + c + d */ \
const uint16x8_t ad = vaddl_u8(a, d); \
const uint16x8_t bc = vaddl_u8(b, c); \
const uint16x8_t abcd = vaddq_u16(ad, bc); \
/* 3a + b + c + 3d */ \
const uint16x8_t al = vaddq_u16(abcd, vshlq_n_u16(ad, 1)); \
/* a + 3b + 3c + d */ \
const uint16x8_t bl = vaddq_u16(abcd, vshlq_n_u16(bc, 1)); \
\
const uint8x8_t diag2 = vshrn_n_u16(al, 3); \
const uint8x8_t diag1 = vshrn_n_u16(bl, 3); \
\
const uint8x8_t A = vrhadd_u8(a, diag1); \
const uint8x8_t B = vrhadd_u8(b, diag2); \
const uint8x8_t C = vrhadd_u8(c, diag2); \
const uint8x8_t D = vrhadd_u8(d, diag1); \
\
uint8x8x2_t A_B, C_D; \
INIT_VECTOR2(A_B, A, B); \
INIT_VECTOR2(C_D, C, D); \
vst2_u8(out + 0, A_B); \
vst2_u8(out + 32, C_D); \
} while (0)
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample16Pixels_NEON(const uint8_t* r1, const uint8_t* r2,
uint8_t* out) {
UPSAMPLE_16PIXELS(r1, r2, out);
}
#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \
uint8_t r1[9], r2[9]; \
memcpy(r1, (tb), (num_pixels)); \
memcpy(r2, (bb), (num_pixels)); \
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 9 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 9 - (num_pixels)); \
Upsample16Pixels_NEON(r1, r2, out); \
}
//-----------------------------------------------------------------------------
// YUV->RGB conversion
// note: we represent the 33050 large constant as 32768 + 282
static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
#define v255 vdup_n_u8(255)
#define STORE_Rgb(out, r, g, b) do { \
uint8x8x3_t r_g_b; \
INIT_VECTOR3(r_g_b, r, g, b); \
vst3_u8(out, r_g_b); \
} while (0)
#define STORE_Bgr(out, r, g, b) do { \
uint8x8x3_t b_g_r; \
INIT_VECTOR3(b_g_r, b, g, r); \
vst3_u8(out, b_g_r); \
} while (0)
#define STORE_Rgba(out, r, g, b) do { \
uint8x8x4_t r_g_b_v255; \
INIT_VECTOR4(r_g_b_v255, r, g, b, v255); \
vst4_u8(out, r_g_b_v255); \
} while (0)
#define STORE_Bgra(out, r, g, b) do { \
uint8x8x4_t b_g_r_v255; \
INIT_VECTOR4(b_g_r_v255, b, g, r, v255); \
vst4_u8(out, b_g_r_v255); \
} while (0)
#define STORE_Argb(out, r, g, b) do { \
uint8x8x4_t v255_r_g_b; \
INIT_VECTOR4(v255_r_g_b, v255, r, g, b); \
vst4_u8(out, v255_r_g_b); \
} while (0)
#if !defined(WEBP_SWAP_16BIT_CSP)
#define ZIP_U8(lo, hi) vzip_u8((lo), (hi))
#else
#define ZIP_U8(lo, hi) vzip_u8((hi), (lo))
#endif
#define STORE_Rgba4444(out, r, g, b) do { \
const uint8x8_t rg = vsri_n_u8(r, g, 4); /* shift g, insert r */ \
const uint8x8_t ba = vsri_n_u8(b, v255, 4); /* shift a, insert b */ \
const uint8x8x2_t rgba4444 = ZIP_U8(rg, ba); \
vst1q_u8(out, vcombine_u8(rgba4444.val[0], rgba4444.val[1])); \
} while (0)
#define STORE_Rgb565(out, r, g, b) do { \
const uint8x8_t rg = vsri_n_u8(r, g, 5); /* shift g and insert r */ \
const uint8x8_t g1 = vshl_n_u8(g, 3); /* pre-shift g: 3bits */ \
const uint8x8_t gb = vsri_n_u8(g1, b, 3); /* shift b and insert g */ \
const uint8x8x2_t rgb565 = ZIP_U8(rg, gb); \
vst1q_u8(out, vcombine_u8(rgb565.val[0], rgb565.val[1])); \
} while (0)
#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) do { \
int i; \
for (i = 0; i < N; i += 8) { \
const int off = ((cur_x) + i) * XSTEP; \
const uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \
const uint8x8_t u = vld1_u8((src_uv) + i + 0); \
const uint8x8_t v = vld1_u8((src_uv) + i + 16); \
const int16x8_t Y0 = vreinterpretq_s16_u16(vshll_n_u8(y, 7)); \
const int16x8_t U0 = vreinterpretq_s16_u16(vshll_n_u8(u, 7)); \
const int16x8_t V0 = vreinterpretq_s16_u16(vshll_n_u8(v, 7)); \
const int16x8_t Y1 = vqdmulhq_lane_s16(Y0, coeff1, 0); \
const int16x8_t R0 = vqdmulhq_lane_s16(V0, coeff1, 1); \
const int16x8_t G0 = vqdmulhq_lane_s16(U0, coeff1, 2); \
const int16x8_t G1 = vqdmulhq_lane_s16(V0, coeff1, 3); \
const int16x8_t B0 = vqdmulhq_n_s16(U0, 282); \
const int16x8_t R1 = vqaddq_s16(Y1, R_Rounder); \
const int16x8_t G2 = vqaddq_s16(Y1, G_Rounder); \
const int16x8_t B1 = vqaddq_s16(Y1, B_Rounder); \
const int16x8_t R2 = vqaddq_s16(R0, R1); \
const int16x8_t G3 = vqaddq_s16(G0, G1); \
const int16x8_t B2 = vqaddq_s16(B0, B1); \
const int16x8_t G4 = vqsubq_s16(G2, G3); \
const int16x8_t B3 = vqaddq_s16(B2, U0); \
const uint8x8_t R = vqshrun_n_s16(R2, YUV_FIX2); \
const uint8x8_t G = vqshrun_n_s16(G4, YUV_FIX2); \
const uint8x8_t B = vqshrun_n_s16(B3, YUV_FIX2); \
STORE_ ## FMT(out + off, R, G, B); \
} \
} while (0)
#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \
int i; \
for (i = 0; i < N; i++) { \
const int off = ((cur_x) + i) * XSTEP; \
const int y = src_y[(cur_x) + i]; \
const int u = (src_uv)[i]; \
const int v = (src_uv)[i + 16]; \
FUNC(y, u, v, rgb + off); \
} \
}
#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv, \
top_dst, bottom_dst, cur_x, len) { \
CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x); \
if (bottom_y != NULL) { \
CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
} \
}
#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv, \
top_dst, bottom_dst, cur_x, len) { \
CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x); \
if (bottom_y != NULL) { \
CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
} \
}
#define NEON_UPSAMPLE_FUNC(FUNC_NAME, FMT, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int block; \
/* 16 byte aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[2 * 32 + 15]; \
uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
const int uv_len = (len + 1) >> 1; \
/* 9 pixels must be read-able for each block */ \
const int num_blocks = (uv_len - 1) >> 3; \
const int leftover = uv_len - num_blocks * 8; \
const int last_pos = 1 + 16 * num_blocks; \
\
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
\
const int16x4_t coeff1 = vld1_s16(kCoeffs1); \
const int16x8_t R_Rounder = vdupq_n_s16(-14234); \
const int16x8_t G_Rounder = vdupq_n_s16(8708); \
const int16x8_t B_Rounder = vdupq_n_s16(-17685); \
\
/* Treat the first pixel in regular way */ \
assert(top_y != NULL); \
{ \
const int u0 = (top_u[0] + u_diag) >> 1; \
const int v0 = (top_v[0] + v_diag) >> 1; \
VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst); \
} \
if (bottom_y != NULL) { \
const int u0 = (cur_u[0] + u_diag) >> 1; \
const int v0 = (cur_v[0] + v_diag) >> 1; \
VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst); \
} \
\
for (block = 0; block < num_blocks; ++block) { \
UPSAMPLE_16PIXELS(top_u, cur_u, r_uv); \
UPSAMPLE_16PIXELS(top_v, cur_v, r_uv + 16); \
CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, r_uv, \
top_dst, bottom_dst, 16 * block + 1, 16); \
top_u += 8; \
cur_u += 8; \
top_v += 8; \
cur_v += 8; \
} \
\
UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv); \
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16); \
CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv, \
top_dst, bottom_dst, last_pos, len - last_pos); \
}
// NEON variants of the fancy upsampler.
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair_NEON, Rgba, 4)
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair_NEON, Bgra, 4)
#if !defined(WEBP_REDUCE_CSP)
NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair_NEON, Rgb, 3)
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair_NEON, Bgr, 3)
NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair_NEON, Argb, 4)
NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_NEON, Rgba4444, 2)
NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair_NEON, Rgb565, 2)
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_NEON;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_NEON;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_NEON;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_NEON;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_NEON;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_NEON;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_NEON;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_NEON;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_NEON;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_NEON;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_NEON;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
#endif // WEBP_USE_NEON
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_NEON))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersNEON)
#endif

View File

@ -0,0 +1,267 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE2 version of YUV to RGB upsampling functions.
//
// Author: somnath@google.com (Somnath Banerjee)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE2)
#include <assert.h>
#include <emmintrin.h>
#include <string.h>
#include "dsp_yuv.h"
#ifdef FANCY_UPSAMPLING
// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
// u = (9*a + 3*b + 3*c + d + 8) / 16
// = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
// = (a + m + 1) / 2
// where m = (a + 3*b + 3*c + d) / 8
// = ((a + b + c + d) / 2 + b + c) / 4
//
// Let's say k = (a + b + c + d) / 4.
// We can compute k as
// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
//
// Then m can be written as
// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
#define GET_M(ij, in, out) do { \
const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \
const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \
const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \
const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\
const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \
(out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \
} while (0)
// pack and store two alternating pixel rows
#define PACK_AND_STORE(a, b, da, db, out) do { \
const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \
const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \
_mm_store_si128(((__m128i*)(out)) + 0, t_1); \
_mm_store_si128(((__m128i*)(out)) + 1, t_2); \
} while (0)
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
#define UPSAMPLE_32PIXELS(r1, r2, out) { \
const __m128i one = _mm_set1_epi8(1); \
const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \
const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \
const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]); \
const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]); \
\
const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \
const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \
const __m128i st = _mm_xor_si128(s, t); /* st = s^t */ \
\
const __m128i ad = _mm_xor_si128(a, d); /* ad = a^d */ \
const __m128i bc = _mm_xor_si128(b, c); /* bc = b^c */ \
\
const __m128i t1 = _mm_or_si128(ad, bc); /* (a^d) | (b^c) */ \
const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \
const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \
const __m128i t4 = _mm_avg_epu8(s, t); \
const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \
__m128i diag1, diag2; \
\
GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \
PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \
}
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \
uint8_t r1[17], r2[17]; \
memcpy(r1, (tb), (num_pixels)); \
memcpy(r2, (bb), (num_pixels)); \
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
/* using the shared function instead of the macro saves ~3k code size */ \
Upsample32Pixels_SSE2(r1, r2, out); \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
if ((bottom_y) != NULL) { \
FUNC##32_SSE2((bottom_y) + (cur_x), r_u + 64, r_v + 64, \
(bottom_dst) + (cur_x) * (XSTEP)); \
} \
} while (0)
#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
uint8_t* const r_v = r_u + 32; \
\
assert(top_y != NULL); \
{ /* Treat the first pixel in regular way */ \
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
const int u0_t = (top_u[0] + u_diag) >> 1; \
const int v0_t = (top_v[0] + v_diag) >> 1; \
FUNC(top_y[0], u0_t, v0_t, top_dst); \
if (bottom_y != NULL) { \
const int u0_b = (cur_u[0] + u_diag) >> 1; \
const int v0_b = (cur_v[0] + v_diag) >> 1; \
FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \
} \
} \
/* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \
for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \
UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \
CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \
} \
if (len > 1) { \
const int left_over = ((len + 1) >> 1) - (pos >> 1); \
uint8_t* const tmp_top_dst = r_u + 4 * 32; \
uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \
uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \
uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \
assert(left_over > 0); \
UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
memcpy(tmp_top, top_y + pos, len - pos); \
if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \
CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \
tmp_bottom_dst, 0); \
memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \
if (bottom_y != NULL) { \
memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \
(len - pos) * (XSTEP)); \
} \
} \
}
// SSE2 variants of the fancy upsampler.
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair_SSE2, VP8YuvToRgba, 4)
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair_SSE2, VP8YuvToBgra, 4)
#if !defined(WEBP_REDUCE_CSP)
SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE2, VP8YuvToRgb, 3)
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE2, VP8YuvToBgr, 3)
SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair_SSE2, VP8YuvToArgb, 4)
SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair_SSE2, VP8YuvToRgba4444, 2)
SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair_SSE2, VP8YuvToRgb565, 2)
#endif // WEBP_REDUCE_CSP
#undef GET_M
#undef PACK_AND_STORE
#undef UPSAMPLE_32PIXELS
#undef UPSAMPLE_LAST_BLOCK
#undef CONVERT2RGB
#undef CONVERT2RGB_32
#undef SSE2_UPSAMPLE_FUNC
//------------------------------------------------------------------------------
// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair_SSE2;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair_SSE2;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair_SSE2;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair_SSE2;
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE2;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE2;
WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair_SSE2;
WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair_SSE2;
WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair_SSE2;
WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair_SSE2;
WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair_SSE2;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE2(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \
CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \
} \
if (i < len) { /* C-fallback */ \
CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \
} \
}
YUV444_FUNC(Yuv444ToRgba_SSE2, VP8YuvToRgba32_SSE2, WebPYuv444ToRgba_C, 4);
YUV444_FUNC(Yuv444ToBgra_SSE2, VP8YuvToBgra32_SSE2, WebPYuv444ToBgra_C, 4);
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb_SSE2, VP8YuvToRgb32_SSE2, WebPYuv444ToRgb_C, 3);
YUV444_FUNC(Yuv444ToBgr_SSE2, VP8YuvToBgr32_SSE2, WebPYuv444ToBgr_C, 3);
YUV444_FUNC(Yuv444ToArgb_SSE2, VP8YuvToArgb32_SSE2, WebPYuv444ToArgb_C, 4)
YUV444_FUNC(Yuv444ToRgba4444_SSE2, VP8YuvToRgba444432_SSE2, \
WebPYuv444ToRgba4444_C, 2)
YUV444_FUNC(Yuv444ToRgb565_SSE2, VP8YuvToRgb56532_SSE2, WebPYuv444ToRgb565_C, 2)
#endif // WEBP_REDUCE_CSP
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba_SSE2;
WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra_SSE2;
WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba_SSE2;
WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra_SSE2;
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE2;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE2;
WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb_SSE2;
WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444_SSE2;
WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565_SSE2;
WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb_SSE2;
WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444_SSE2;
#endif // WEBP_REDUCE_CSP
}
#else
WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE2)
#endif // WEBP_USE_SSE2
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE2))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE2)
#endif

View File

@ -0,0 +1,239 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// SSE41 version of YUV to RGB upsampling functions.
//
// Author: somnath@google.com (Somnath Banerjee)
#include "dsp_dsp.h"
#if defined(WEBP_USE_SSE41)
#include <assert.h>
#include <smmintrin.h>
#include <string.h>
#include "dsp_yuv.h"
#ifdef FANCY_UPSAMPLING
#if !defined(WEBP_REDUCE_CSP)
// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
// u = (9*a + 3*b + 3*c + d + 8) / 16
// = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
// = (a + m + 1) / 2
// where m = (a + 3*b + 3*c + d) / 8
// = ((a + b + c + d) / 2 + b + c) / 4
//
// Let's say k = (a + b + c + d) / 4.
// We can compute k as
// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
//
// Then m can be written as
// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1
#define GET_M(ij, in, out) do { \
const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \
const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \
const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \
const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\
const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \
(out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \
} while (0)
// pack and store two alternating pixel rows
#define PACK_AND_STORE(a, b, da, db, out) do { \
const __m128i t_a = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \
const __m128i t_b = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \
const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b); \
const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b); \
_mm_store_si128(((__m128i*)(out)) + 0, t_1); \
_mm_store_si128(((__m128i*)(out)) + 1, t_2); \
} while (0)
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
#define UPSAMPLE_32PIXELS(r1, r2, out) { \
const __m128i one = _mm_set1_epi8(1); \
const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \
const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \
const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]); \
const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]); \
\
const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \
const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \
const __m128i st = _mm_xor_si128(s, t); /* st = s^t */ \
\
const __m128i ad = _mm_xor_si128(a, d); /* ad = a^d */ \
const __m128i bc = _mm_xor_si128(b, c); /* bc = b^c */ \
\
const __m128i t1 = _mm_or_si128(ad, bc); /* (a^d) | (b^c) */ \
const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \
const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \
const __m128i t4 = _mm_avg_epu8(s, t); \
const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \
__m128i diag1, diag2; \
\
GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \
GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \
\
/* pack the alternate pixels */ \
PACK_AND_STORE(a, b, diag1, diag2, (out) + 0); /* store top */ \
PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32); /* store bottom */ \
}
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
uint8_t* const out) {
UPSAMPLE_32PIXELS(r1, r2, out);
}
#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \
uint8_t r1[17], r2[17]; \
memcpy(r1, (tb), (num_pixels)); \
memcpy(r2, (bb), (num_pixels)); \
/* replicate last byte */ \
memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \
memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \
/* using the shared function instead of the macro saves ~3k code size */ \
Upsample32Pixels_SSE41(r1, r2, out); \
}
#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, \
top_dst, bottom_dst, cur_x) do { \
FUNC##32_SSE41((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP)); \
if ((bottom_y) != NULL) { \
FUNC##32_SSE41((bottom_y) + (cur_x), r_u + 64, r_v + 64, \
(bottom_dst) + (cur_x) * (XSTEP)); \
} \
} while (0)
#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
const uint8_t* top_u, const uint8_t* top_v, \
const uint8_t* cur_u, const uint8_t* cur_v, \
uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
int uv_pos, pos; \
/* 16byte-aligned array to cache reconstructed u and v */ \
uint8_t uv_buf[14 * 32 + 15] = { 0 }; \
uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \
uint8_t* const r_v = r_u + 32; \
\
assert(top_y != NULL); \
{ /* Treat the first pixel in regular way */ \
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
const int u0_t = (top_u[0] + u_diag) >> 1; \
const int v0_t = (top_v[0] + v_diag) >> 1; \
FUNC(top_y[0], u0_t, v0_t, top_dst); \
if (bottom_y != NULL) { \
const int u0_b = (cur_u[0] + u_diag) >> 1; \
const int v0_b = (cur_v[0] + v_diag) >> 1; \
FUNC(bottom_y[0], u0_b, v0_b, bottom_dst); \
} \
} \
/* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */ \
for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) { \
UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u); \
UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v); \
CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos); \
} \
if (len > 1) { \
const int left_over = ((len + 1) >> 1) - (pos >> 1); \
uint8_t* const tmp_top_dst = r_u + 4 * 32; \
uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32; \
uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32; \
uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32; \
assert(left_over > 0); \
UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u); \
UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v); \
memcpy(tmp_top, top_y + pos, len - pos); \
if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos); \
CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst, \
tmp_bottom_dst, 0); \
memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP)); \
if (bottom_y != NULL) { \
memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst, \
(len - pos) * (XSTEP)); \
} \
} \
}
// SSE4 variants of the fancy upsampler.
SSE4_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE41, VP8YuvToRgb, 3)
SSE4_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE41, VP8YuvToBgr, 3)
#undef GET_M
#undef PACK_AND_STORE
#undef UPSAMPLE_32PIXELS
#undef UPSAMPLE_LAST_BLOCK
#undef CONVERT2RGB
#undef CONVERT2RGB_32
#undef SSE4_UPSAMPLE_FUNC
#endif // WEBP_REDUCE_CSP
//------------------------------------------------------------------------------
// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
extern void WebPInitUpsamplersSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE41(void) {
#if !defined(WEBP_REDUCE_CSP)
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair_SSE41;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair_SSE41;
#endif // WEBP_REDUCE_CSP
}
#endif // FANCY_UPSAMPLING
//------------------------------------------------------------------------------
extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
extern void WebPInitYUV444ConvertersSSE41(void);
#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP) \
extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len); \
static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
const int max_len = len & ~31; \
for (i = 0; i < max_len; i += 32) { \
CALL(y + i, u + i, v + i, dst + i * (XSTEP)); \
} \
if (i < len) { /* C-fallback */ \
CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i); \
} \
}
#if !defined(WEBP_REDUCE_CSP)
YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3);
YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3);
#endif // WEBP_REDUCE_CSP
WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) {
#if !defined(WEBP_REDUCE_CSP)
WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb_SSE41;
WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr_SSE41;
#endif // WEBP_REDUCE_CSP
}
#else
WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE41)
#endif // WEBP_USE_SSE41
#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE41))
WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE41)
#endif

308
vendor/github.com/sizeofint/webpanimation/dsp_yuv.c generated vendored Normal file
View File

@ -0,0 +1,308 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_yuv.h"
#include <assert.h>
#include <stdlib.h>
//-----------------------------------------------------------------------------
// Plain-C version
#define ROW_FUNC(FUNC_NAME, FUNC, XSTEP) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
const uint8_t* const end = dst + (len & ~1) * (XSTEP); \
while (dst != end) { \
FUNC(y[0], u[0], v[0], dst); \
FUNC(y[1], u[0], v[0], dst + (XSTEP)); \
y += 2; \
++u; \
++v; \
dst += 2 * (XSTEP); \
} \
if (len & 1) { \
FUNC(y[0], u[0], v[0], dst); \
} \
} \
// All variants implemented.
ROW_FUNC(YuvToRgbRow, VP8YuvToRgb, 3)
ROW_FUNC(YuvToBgrRow, VP8YuvToBgr, 3)
ROW_FUNC(YuvToRgbaRow, VP8YuvToRgba, 4)
ROW_FUNC(YuvToBgraRow, VP8YuvToBgra, 4)
ROW_FUNC(YuvToArgbRow, VP8YuvToArgb, 4)
ROW_FUNC(YuvToRgba4444Row, VP8YuvToRgba4444, 2)
ROW_FUNC(YuvToRgb565Row, VP8YuvToRgb565, 2)
#undef ROW_FUNC
// Main call for processing a plane with a WebPSamplerRowFunc function:
void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
const uint8_t* u, const uint8_t* v, int uv_stride,
uint8_t* dst, int dst_stride,
int width, int height, WebPSamplerRowFunc func) {
int j;
for (j = 0; j < height; ++j) {
func(y, u, v, dst, width);
y += y_stride;
if (j & 1) {
u += uv_stride;
v += uv_stride;
}
dst += dst_stride;
}
}
//-----------------------------------------------------------------------------
// Main call
WebPSamplerRowFunc WebPSamplers[MODE_LAST];
extern void WebPInitSamplersSSE2(void);
extern void WebPInitSamplersSSE41(void);
extern void WebPInitSamplersMIPS32(void);
extern void WebPInitSamplersMIPSdspR2(void);
WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
WebPSamplers[MODE_ARGB] = YuvToArgbRow;
WebPSamplers[MODE_RGBA_4444] = YuvToRgba4444Row;
WebPSamplers[MODE_RGB_565] = YuvToRgb565Row;
WebPSamplers[MODE_rgbA] = YuvToRgbaRow;
WebPSamplers[MODE_bgrA] = YuvToBgraRow;
WebPSamplers[MODE_Argb] = YuvToArgbRow;
WebPSamplers[MODE_rgbA_4444] = YuvToRgba4444Row;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitSamplersSSE2();
}
#endif // WEBP_USE_SSE2
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitSamplersSSE41();
}
#endif // WEBP_USE_SSE41
#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
WebPInitSamplersMIPS32();
}
#endif // WEBP_USE_MIPS32
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitSamplersMIPSdspR2();
}
#endif // WEBP_USE_MIPS_DSP_R2
}
}
//-----------------------------------------------------------------------------
// ARGB -> YUV converters
static void ConvertARGBToY_C(const uint32_t* argb, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i) {
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store) {
// No rounding. Last pixel is dealt with separately.
const int uv_width = src_width >> 1;
int i;
for (i = 0; i < uv_width; ++i) {
const uint32_t v0 = argb[2 * i + 0];
const uint32_t v1 = argb[2 * i + 1];
// VP8RGBToU/V expects four accumulated pixels. Hence we need to
// scale r/g/b value by a factor 2. We just shift v0/v1 one bit less.
const int r = ((v0 >> 15) & 0x1fe) + ((v1 >> 15) & 0x1fe);
const int g = ((v0 >> 7) & 0x1fe) + ((v1 >> 7) & 0x1fe);
const int b = ((v0 << 1) & 0x1fe) + ((v1 << 1) & 0x1fe);
const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
if (do_store) {
u[i] = tmp_u;
v[i] = tmp_v;
} else {
// Approximated average-of-four. But it's an acceptable diff.
u[i] = (u[i] + tmp_u + 1) >> 1;
v[i] = (v[i] + tmp_v + 1) >> 1;
}
}
if (src_width & 1) { // last pixel
const uint32_t v0 = argb[2 * i + 0];
const int r = (v0 >> 14) & 0x3fc;
const int g = (v0 >> 6) & 0x3fc;
const int b = (v0 << 2) & 0x3fc;
const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
if (do_store) {
u[i] = tmp_u;
v[i] = tmp_v;
} else {
u[i] = (u[i] + tmp_u + 1) >> 1;
v[i] = (v[i] + tmp_v + 1) >> 1;
}
}
}
//-----------------------------------------------------------------------------
static void ConvertRGB24ToY_C(const uint8_t* rgb, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i, rgb += 3) {
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_C(const uint8_t* bgr, uint8_t* y, int width) {
int i;
for (i = 0; i < width; ++i, bgr += 3) {
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
int i;
for (i = 0; i < width; i += 1, rgb += 4) {
const int r = rgb[0], g = rgb[1], b = rgb[2];
u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
}
}
//-----------------------------------------------------------------------------
#if !WEBP_NEON_OMIT_C_CODE
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
uint64_t diff = 0;
int i;
for (i = 0; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip_y(new_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
for (i = 0; i < len; ++i, ++A, ++B) {
const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
#endif // !WEBP_NEON_OMIT_C_CODE
#undef MAX_Y
//-----------------------------------------------------------------------------
void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store);
uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len);
void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
int16_t* dst, int len);
void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out);
extern void WebPInitConvertARGBToYUVSSE2(void);
extern void WebPInitConvertARGBToYUVSSE41(void);
extern void WebPInitConvertARGBToYUVNEON(void);
extern void WebPInitSharpYUVSSE2(void);
extern void WebPInitSharpYUVNEON(void);
WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
WebPConvertARGBToY = ConvertARGBToY_C;
WebPConvertARGBToUV = WebPConvertARGBToUV_C;
WebPConvertRGB24ToY = ConvertRGB24ToY_C;
WebPConvertBGR24ToY = ConvertBGR24ToY_C;
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
#if !WEBP_NEON_OMIT_C_CODE
WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
#endif
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitConvertARGBToYUVSSE2();
WebPInitSharpYUVSSE2();
}
#endif // WEBP_USE_SSE2
#if defined(WEBP_USE_SSE41)
if (VP8GetCPUInfo(kSSE4_1)) {
WebPInitConvertARGBToYUVSSE41();
}
#endif // WEBP_USE_SSE41
}
#if defined(WEBP_USE_NEON)
if (WEBP_NEON_OMIT_C_CODE ||
(VP8GetCPUInfo != NULL && VP8GetCPUInfo(kNEON))) {
WebPInitConvertARGBToYUVNEON();
WebPInitSharpYUVNEON();
}
#endif // WEBP_USE_NEON
assert(WebPConvertARGBToY != NULL);
assert(WebPConvertARGBToUV != NULL);
assert(WebPConvertRGB24ToY != NULL);
assert(WebPConvertBGR24ToY != NULL);
assert(WebPConvertRGBA32ToUV != NULL);
assert(WebPSharpYUVUpdateY != NULL);
assert(WebPSharpYUVUpdateRGB != NULL);
assert(WebPSharpYUVFilterRow != NULL);
}

210
vendor/github.com/sizeofint/webpanimation/dsp_yuv.h generated vendored Normal file
View File

@ -0,0 +1,210 @@
// Copyright 2010 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// inline YUV<->RGB conversion function
//
// The exact naming is Y'CbCr, following the ITU-R BT.601 standard.
// More information at: http://en.wikipedia.org/wiki/YCbCr
// Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
// U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
// V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
//
// For the Y'CbCr to RGB conversion, the BT.601 specification reads:
// R = 1.164 * (Y-16) + 1.596 * (V-128)
// G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
// B = 1.164 * (Y-16) + 2.018 * (U-128)
// where Y is in the [16,235] range, and U/V in the [16,240] range.
//
// The fixed-point implementation used here is:
// R = (19077 . y + 26149 . v - 14234) >> 6
// G = (19077 . y - 6419 . u - 13320 . v + 8708) >> 6
// B = (19077 . y + 33050 . u - 17685) >> 6
// where the '.' operator is the mulhi_epu16 variant:
// a . b = ((a << 8) * b) >> 16
// that preserves 8 bits of fractional precision before final descaling.
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DSP_YUV_H_
#define WEBP_DSP_YUV_H_
#include "dsp_dsp.h"
#include "dec_vp8_dec.h"
//------------------------------------------------------------------------------
// YUV -> RGB conversion
#ifdef __cplusplus
extern "C" {
#endif
enum {
YUV_FIX = 16, // fixed-point precision for RGB->YUV
YUV_HALF = 1 << (YUV_FIX - 1),
YUV_FIX2 = 6, // fixed-point precision for YUV->RGB
YUV_MASK2 = (256 << YUV_FIX2) - 1
};
//------------------------------------------------------------------------------
// slower on x86 by ~7-8%, but bit-exact with the SSE2/NEON version
static WEBP_INLINE int MultHi(int v, int coeff) { // _mm_mulhi_epu16 emulation
return (v * coeff) >> 8;
}
static WEBP_INLINE int VP8Clip8(int v) {
return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
}
static WEBP_INLINE int VP8YUVToR(int y, int v) {
return VP8Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234);
}
static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
return VP8Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708);
}
static WEBP_INLINE int VP8YUVToB(int y, int u) {
return VP8Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685);
}
static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
uint8_t* const rgb) {
rgb[0] = VP8YUVToR(y, v);
rgb[1] = VP8YUVToG(y, u, v);
rgb[2] = VP8YUVToB(y, u);
}
static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
uint8_t* const bgr) {
bgr[0] = VP8YUVToB(y, u);
bgr[1] = VP8YUVToG(y, u, v);
bgr[2] = VP8YUVToR(y, v);
}
static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
uint8_t* const rgb) {
const int r = VP8YUVToR(y, v); // 5 usable bits
const int g = VP8YUVToG(y, u, v); // 6 usable bits
const int b = VP8YUVToB(y, u); // 5 usable bits
const int rg = (r & 0xf8) | (g >> 5);
const int gb = ((g << 3) & 0xe0) | (b >> 3);
#if (WEBP_SWAP_16BIT_CSP == 1)
rgb[0] = gb;
rgb[1] = rg;
#else
rgb[0] = rg;
rgb[1] = gb;
#endif
}
static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
uint8_t* const argb) {
const int r = VP8YUVToR(y, v); // 4 usable bits
const int g = VP8YUVToG(y, u, v); // 4 usable bits
const int b = VP8YUVToB(y, u); // 4 usable bits
const int rg = (r & 0xf0) | (g >> 4);
const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
#if (WEBP_SWAP_16BIT_CSP == 1)
argb[0] = ba;
argb[1] = rg;
#else
argb[0] = rg;
argb[1] = ba;
#endif
}
//-----------------------------------------------------------------------------
// Alpha handling variants
static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const argb) {
argb[0] = 0xff;
VP8YuvToRgb(y, u, v, argb + 1);
}
static WEBP_INLINE void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const bgra) {
VP8YuvToBgr(y, u, v, bgra);
bgra[3] = 0xff;
}
static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
uint8_t* const rgba) {
VP8YuvToRgb(y, u, v, rgba);
rgba[3] = 0xff;
}
//-----------------------------------------------------------------------------
// SSE2 extra functions (mostly for upsampling_sse2.c)
#if defined(WEBP_USE_SSE2)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst);
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
#endif // WEBP_USE_SSE2
//-----------------------------------------------------------------------------
// SSE41 extra functions (mostly for upsampling_sse41.c)
#if defined(WEBP_USE_SSE41)
// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
#endif // WEBP_USE_SSE41
//------------------------------------------------------------------------------
// RGB -> YUV conversion
// Stub functions that can be called with various rounding values:
static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
uv = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);
return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;
}
static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
const int luma = 16839 * r + 33059 * g + 6420 * b;
return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX; // no need to clip
}
static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
const int u = -9719 * r - 19081 * g + 28800 * b;
return VP8ClipUV(u, rounding);
}
static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
const int v = +28800 * r - 24116 * g - 4684 * b;
return VP8ClipUV(v, rounding);
}
#ifdef __cplusplus
} // extern "C"
#endif
#endif // WEBP_DSP_YUV_H_

View File

@ -0,0 +1,103 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of YUV to RGB upsampling functions.
//
// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS32)
#include "dsp_yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i, r, g, b; \
int temp0, temp1, temp2, temp3, temp4; \
for (i = 0; i < (len >> 1); i++) { \
temp1 = MultHi(v[0], 26149); \
temp3 = MultHi(v[0], 13320); \
temp2 = MultHi(u[0], 6419); \
temp4 = MultHi(u[0], 33050); \
temp0 = MultHi(y[0], 19077); \
temp1 -= 14234; \
temp3 -= 8708; \
temp2 += temp3; \
temp4 -= 17685; \
r = VP8Clip8(temp0 + temp1); \
g = VP8Clip8(temp0 - temp2); \
b = VP8Clip8(temp0 + temp4); \
temp0 = MultHi(y[1], 19077); \
dst[R] = r; \
dst[G] = g; \
dst[B] = b; \
if (A) dst[A] = 0xff; \
r = VP8Clip8(temp0 + temp1); \
g = VP8Clip8(temp0 - temp2); \
b = VP8Clip8(temp0 + temp4); \
dst[R + XSTEP] = r; \
dst[G + XSTEP] = g; \
dst[B + XSTEP] = b; \
if (A) dst[A + XSTEP] = 0xff; \
y += 2; \
++u; \
++v; \
dst += 2 * XSTEP; \
} \
if (len & 1) { \
temp1 = MultHi(v[0], 26149); \
temp3 = MultHi(v[0], 13320); \
temp2 = MultHi(u[0], 6419); \
temp4 = MultHi(u[0], 33050); \
temp0 = MultHi(y[0], 19077); \
temp1 -= 14234; \
temp3 -= 8708; \
temp2 += temp3; \
temp4 -= 17685; \
r = VP8Clip8(temp0 + temp1); \
g = VP8Clip8(temp0 - temp2); \
b = VP8Clip8(temp0 + temp4); \
dst[R] = r; \
dst[G] = g; \
dst[B] = b; \
if (A) dst[A] = 0xff; \
} \
}
ROW_FUNC(YuvToRgbRow_MIPS32, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow_MIPS32, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow_MIPS32, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow_MIPS32, 4, 2, 1, 0, 3)
#undef ROW_FUNC
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersMIPS32(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPS32;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPS32;
WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPS32;
WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPS32;
}
#else // !WEBP_USE_MIPS32
WEBP_DSP_INIT_STUB(WebPInitSamplersMIPS32)
#endif // WEBP_USE_MIPS32

View File

@ -0,0 +1,134 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS DSPr2 version of YUV to RGB upsampling functions.
//
// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
// Djordje Pesut (djordje.pesut@imgtec.com)
#include "dsp_dsp.h"
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "dsp_yuv.h"
//------------------------------------------------------------------------------
// simple point-sampling
#define ROW_FUNC_PART_1() \
"lbu %[temp3], 0(%[v]) \n\t" \
"lbu %[temp4], 0(%[u]) \n\t" \
"lbu %[temp0], 0(%[y]) \n\t" \
"mul %[temp1], %[t_con_1], %[temp3] \n\t" \
"mul %[temp3], %[t_con_2], %[temp3] \n\t" \
"mul %[temp2], %[t_con_3], %[temp4] \n\t" \
"mul %[temp4], %[t_con_4], %[temp4] \n\t" \
"mul %[temp0], %[t_con_5], %[temp0] \n\t" \
"subu %[temp1], %[temp1], %[t_con_6] \n\t" \
"subu %[temp3], %[temp3], %[t_con_7] \n\t" \
"addu %[temp2], %[temp2], %[temp3] \n\t" \
"subu %[temp4], %[temp4], %[t_con_8] \n\t" \
#define ROW_FUNC_PART_2(R, G, B, K) \
"addu %[temp5], %[temp0], %[temp1] \n\t" \
"subu %[temp6], %[temp0], %[temp2] \n\t" \
"addu %[temp7], %[temp0], %[temp4] \n\t" \
".if " #K " \n\t" \
"lbu %[temp0], 1(%[y]) \n\t" \
".endif \n\t" \
"shll_s.w %[temp5], %[temp5], 17 \n\t" \
"shll_s.w %[temp6], %[temp6], 17 \n\t" \
".if " #K " \n\t" \
"mul %[temp0], %[t_con_5], %[temp0] \n\t" \
".endif \n\t" \
"shll_s.w %[temp7], %[temp7], 17 \n\t" \
"precrqu_s.qb.ph %[temp5], %[temp5], $zero \n\t" \
"precrqu_s.qb.ph %[temp6], %[temp6], $zero \n\t" \
"precrqu_s.qb.ph %[temp7], %[temp7], $zero \n\t" \
"srl %[temp5], %[temp5], 24 \n\t" \
"srl %[temp6], %[temp6], 24 \n\t" \
"srl %[temp7], %[temp7], 24 \n\t" \
"sb %[temp5], " #R "(%[dst]) \n\t" \
"sb %[temp6], " #G "(%[dst]) \n\t" \
"sb %[temp7], " #B "(%[dst]) \n\t" \
#define ASM_CLOBBER_LIST() \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7) \
: [t_con_1]"r"(t_con_1), [t_con_2]"r"(t_con_2), [t_con_3]"r"(t_con_3), \
[t_con_4]"r"(t_con_4), [t_con_5]"r"(t_con_5), [t_con_6]"r"(t_con_6), \
[u]"r"(u), [v]"r"(v), [y]"r"(y), [dst]"r"(dst), \
[t_con_7]"r"(t_con_7), [t_con_8]"r"(t_con_8) \
: "memory", "hi", "lo" \
#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
static void FUNC_NAME(const uint8_t* y, \
const uint8_t* u, const uint8_t* v, \
uint8_t* dst, int len) { \
int i; \
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
const int t_con_1 = 26149; \
const int t_con_2 = 13320; \
const int t_con_3 = 6419; \
const int t_con_4 = 33050; \
const int t_con_5 = 19077; \
const int t_con_6 = 14234; \
const int t_con_7 = 8708; \
const int t_con_8 = 17685; \
for (i = 0; i < (len >> 1); i++) { \
__asm__ volatile ( \
ROW_FUNC_PART_1() \
ROW_FUNC_PART_2(R, G, B, 1) \
ROW_FUNC_PART_2(R + XSTEP, G + XSTEP, B + XSTEP, 0) \
ASM_CLOBBER_LIST() \
); \
if (A) dst[A] = dst[A + XSTEP] = 0xff; \
y += 2; \
++u; \
++v; \
dst += 2 * XSTEP; \
} \
if (len & 1) { \
__asm__ volatile ( \
ROW_FUNC_PART_1() \
ROW_FUNC_PART_2(R, G, B, 0) \
ASM_CLOBBER_LIST() \
); \
if (A) dst[A] = 0xff; \
} \
}
ROW_FUNC(YuvToRgbRow_MIPSdspR2, 3, 0, 1, 2, 0)
ROW_FUNC(YuvToRgbaRow_MIPSdspR2, 4, 0, 1, 2, 3)
ROW_FUNC(YuvToBgrRow_MIPSdspR2, 3, 2, 1, 0, 0)
ROW_FUNC(YuvToBgraRow_MIPSdspR2, 4, 2, 1, 0, 3)
#undef ROW_FUNC
#undef ASM_CLOBBER_LIST
#undef ROW_FUNC_PART_2
#undef ROW_FUNC_PART_1
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersMIPSdspR2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow_MIPSdspR2;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_MIPSdspR2;
WebPSamplers[MODE_BGR] = YuvToBgrRow_MIPSdspR2;
WebPSamplers[MODE_BGRA] = YuvToBgraRow_MIPSdspR2;
}
#else // !WEBP_USE_MIPS_DSP_R2
WEBP_DSP_INIT_STUB(WebPInitSamplersMIPSdspR2)
#endif // WEBP_USE_MIPS_DSP_R2

View File

@ -0,0 +1,288 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_yuv.h"
#if defined(WEBP_USE_NEON)
#include <assert.h>
#include <stdlib.h>
#include "dsp_neon.h"
//-----------------------------------------------------------------------------
static uint8x8_t ConvertRGBToY_NEON(const uint8x8_t R,
const uint8x8_t G,
const uint8x8_t B) {
const uint16x8_t r = vmovl_u8(R);
const uint16x8_t g = vmovl_u8(G);
const uint16x8_t b = vmovl_u8(B);
const uint16x4_t r_lo = vget_low_u16(r);
const uint16x4_t r_hi = vget_high_u16(r);
const uint16x4_t g_lo = vget_low_u16(g);
const uint16x4_t g_hi = vget_high_u16(g);
const uint16x4_t b_lo = vget_low_u16(b);
const uint16x4_t b_hi = vget_high_u16(b);
const uint32x4_t tmp0_lo = vmull_n_u16( r_lo, 16839u);
const uint32x4_t tmp0_hi = vmull_n_u16( r_hi, 16839u);
const uint32x4_t tmp1_lo = vmlal_n_u16(tmp0_lo, g_lo, 33059u);
const uint32x4_t tmp1_hi = vmlal_n_u16(tmp0_hi, g_hi, 33059u);
const uint32x4_t tmp2_lo = vmlal_n_u16(tmp1_lo, b_lo, 6420u);
const uint32x4_t tmp2_hi = vmlal_n_u16(tmp1_hi, b_hi, 6420u);
const uint16x8_t Y1 = vcombine_u16(vrshrn_n_u32(tmp2_lo, 16),
vrshrn_n_u32(tmp2_hi, 16));
const uint16x8_t Y2 = vaddq_u16(Y1, vdupq_n_u16(16));
return vqmovn_u16(Y2);
}
static void ConvertRGB24ToY_NEON(const uint8_t* rgb, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 3 * 8) {
const uint8x8x3_t RGB = vld3_u8(rgb);
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[0], RGB.val[1], RGB.val[2]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i, rgb += 3) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_NEON(const uint8_t* bgr, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, bgr += 3 * 8) {
const uint8x8x3_t BGR = vld3_u8(bgr);
const uint8x8_t Y = ConvertRGBToY_NEON(BGR.val[2], BGR.val[1], BGR.val[0]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i, bgr += 3) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
static void ConvertARGBToY_NEON(const uint32_t* argb, uint8_t* y, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8) {
const uint8x8x4_t RGB = vld4_u8((const uint8_t*)&argb[i]);
const uint8x8_t Y = ConvertRGBToY_NEON(RGB.val[2], RGB.val[1], RGB.val[0]);
vst1_u8(y + i, Y);
}
for (; i < width; ++i) { // left-over
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
//-----------------------------------------------------------------------------
// computes: DST_s16 = [(C0 * r + C1 * g + C2 * b) >> 16] + CST
#define MULTIPLY_16b_PREAMBLE(r, g, b) \
const int16x4_t r_lo = vreinterpret_s16_u16(vget_low_u16(r)); \
const int16x4_t r_hi = vreinterpret_s16_u16(vget_high_u16(r)); \
const int16x4_t g_lo = vreinterpret_s16_u16(vget_low_u16(g)); \
const int16x4_t g_hi = vreinterpret_s16_u16(vget_high_u16(g)); \
const int16x4_t b_lo = vreinterpret_s16_u16(vget_low_u16(b)); \
const int16x4_t b_hi = vreinterpret_s16_u16(vget_high_u16(b))
#define MULTIPLY_16b(C0, C1, C2, CST, DST_s16) do { \
const int32x4_t tmp0_lo = vmull_n_s16( r_lo, C0); \
const int32x4_t tmp0_hi = vmull_n_s16( r_hi, C0); \
const int32x4_t tmp1_lo = vmlal_n_s16(tmp0_lo, g_lo, C1); \
const int32x4_t tmp1_hi = vmlal_n_s16(tmp0_hi, g_hi, C1); \
const int32x4_t tmp2_lo = vmlal_n_s16(tmp1_lo, b_lo, C2); \
const int32x4_t tmp2_hi = vmlal_n_s16(tmp1_hi, b_hi, C2); \
const int16x8_t tmp3 = vcombine_s16(vshrn_n_s32(tmp2_lo, 16), \
vshrn_n_s32(tmp2_hi, 16)); \
DST_s16 = vaddq_s16(tmp3, vdupq_n_s16(CST)); \
} while (0)
// This needs to be a macro, since (128 << SHIFT) needs to be an immediate.
#define CONVERT_RGB_TO_UV(r, g, b, SHIFT, U_DST, V_DST) do { \
MULTIPLY_16b_PREAMBLE(r, g, b); \
MULTIPLY_16b(-9719, -19081, 28800, 128 << SHIFT, U_DST); \
MULTIPLY_16b(28800, -24116, -4684, 128 << SHIFT, V_DST); \
} while (0)
static void ConvertRGBA32ToUV_NEON(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
int i;
for (i = 0; i + 8 <= width; i += 8, rgb += 4 * 8) {
const uint16x8x4_t RGB = vld4q_u16((const uint16_t*)rgb);
int16x8_t U, V;
CONVERT_RGB_TO_UV(RGB.val[0], RGB.val[1], RGB.val[2], 2, U, V);
vst1_u8(u + i, vqrshrun_n_s16(U, 2));
vst1_u8(v + i, vqrshrun_n_s16(V, 2));
}
for (; i < width; i += 1, rgb += 4) {
const int r = rgb[0], g = rgb[1], b = rgb[2];
u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
}
}
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store) {
int i;
for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
const uint16x8_t R = vpaddlq_u8(RGB.val[2]); // pair-wise adds
const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
int16x8_t U_tmp, V_tmp;
CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
{
const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
if (do_store) {
vst1_u8(u, U);
vst1_u8(v, V);
} else {
const uint8x8_t prev_u = vld1_u8(u);
const uint8x8_t prev_v = vld1_u8(v);
vst1_u8(u, vrhadd_u8(U, prev_u));
vst1_u8(v, vrhadd_u8(V, prev_v));
}
}
}
if (i < src_width) { // left-over
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
}
}
//------------------------------------------------------------------------------
extern void WebPInitConvertARGBToYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVNEON(void) {
WebPConvertRGB24ToY = ConvertRGB24ToY_NEON;
WebPConvertBGR24ToY = ConvertBGR24ToY_NEON;
WebPConvertARGBToY = ConvertARGBToY_NEON;
WebPConvertARGBToUV = ConvertARGBToUV_NEON;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_NEON;
}
//------------------------------------------------------------------------------
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y_NEON(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
int i;
const int16x8_t zero = vdupq_n_s16(0);
const int16x8_t max = vdupq_n_s16(MAX_Y);
uint64x2_t sum = vdupq_n_u64(0);
uint64_t diff;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i));
const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i));
const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i));
const int16x8_t D = vsubq_s16(A, B); // diff_y
const int16x8_t F = vaddq_s16(C, D); // new_y
const uint16x8_t H =
vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero));
const int16x8_t I = vabsq_s16(D); // abs(diff_y)
vst1q_u16(dst + i, H);
sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I)));
}
diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1);
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)(dst[i]) + diff_y;
dst[i] = clip_y_NEON(new_y);
diff += (uint64_t)(abs(diff_y));
}
return diff;
}
static void SharpYUVUpdateRGB_NEON(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i;
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t A = vld1q_s16(ref + i);
const int16x8_t B = vld1q_s16(src + i);
const int16x8_t C = vld1q_s16(dst + i);
const int16x8_t D = vsubq_s16(A, B); // diff_uv
const int16x8_t E = vaddq_s16(C, D); // new_uv
vst1q_s16(dst + i, E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
const int16x8_t max = vdupq_n_s16(MAX_Y);
const int16x8_t zero = vdupq_n_s16(0);
for (i = 0; i + 8 <= len; i += 8) {
const int16x8_t a0 = vld1q_s16(A + i + 0);
const int16x8_t a1 = vld1q_s16(A + i + 1);
const int16x8_t b0 = vld1q_s16(B + i + 0);
const int16x8_t b1 = vld1q_s16(B + i + 1);
const int16x8_t a0b1 = vaddq_s16(a0, b1);
const int16x8_t a1b0 = vaddq_s16(a1, b0);
const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1
const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1)
const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0)
const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3);
const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3);
const int16x8_t d0 = vaddq_s16(c1, a0);
const int16x8_t d1 = vaddq_s16(c0, a1);
const int16x8_t e0 = vrshrq_n_s16(d0, 1);
const int16x8_t e1 = vrshrq_n_s16(d1, 1);
const int16x8x2_t f = vzipq_s16(e0, e1);
const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0));
const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8));
const int16x8_t h0 = vaddq_s16(g0, f.val[0]);
const int16x8_t h1 = vaddq_s16(g1, f.val[1]);
const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero);
const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero);
vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0));
vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1));
}
for (; i < len; ++i) {
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_y_NEON(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y_NEON(best_y[2 * i + 1] + v1);
}
}
#undef MAX_Y
//------------------------------------------------------------------------------
extern void WebPInitSharpYUVNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVNEON(void) {
WebPSharpYUVUpdateY = SharpYUVUpdateY_NEON;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_NEON;
WebPSharpYUVFilterRow = SharpYUVFilterRow_NEON;
}
#else // !WEBP_USE_NEON
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVNEON)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVNEON)
#endif // WEBP_USE_NEON

View File

@ -0,0 +1,874 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_yuv.h"
#if defined(WEBP_USE_SSE2)
#include "dsp_common_sse2.h"
#include <stdlib.h>
#include <emmintrin.h>
//-----------------------------------------------------------------------------
// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
const __m128i* const U0,
const __m128i* const V0,
__m128i* const R,
__m128i* const G,
__m128i* const B) {
const __m128i k19077 = _mm_set1_epi16(19077);
const __m128i k26149 = _mm_set1_epi16(26149);
const __m128i k14234 = _mm_set1_epi16(14234);
// 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
const __m128i k33050 = _mm_set1_epi16((short)33050);
const __m128i k17685 = _mm_set1_epi16(17685);
const __m128i k6419 = _mm_set1_epi16(6419);
const __m128i k13320 = _mm_set1_epi16(13320);
const __m128i k8708 = _mm_set1_epi16(8708);
const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
const __m128i R1 = _mm_sub_epi16(Y1, k14234);
const __m128i R2 = _mm_add_epi16(R1, R0);
const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
const __m128i G2 = _mm_add_epi16(Y1, k8708);
const __m128i G3 = _mm_add_epi16(G0, G1);
const __m128i G4 = _mm_sub_epi16(G2, G3);
// be careful with the saturated *unsigned* arithmetic here!
const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
const __m128i B1 = _mm_adds_epu16(B0, Y1);
const __m128i B2 = _mm_subs_epu16(B1, k17685);
// use logical shift for B2, which can be larger than 32767
*R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815]
*G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710]
*B = _mm_srli_epi16(B2, 6); // range: [0, 34238]
}
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
}
// Load and replicate the U/V samples
static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
V0 = Load_HI_16_SSE2(v);
ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB_SSE2(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
V0 = Load_UV_HI_8_SSE2(v);
ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
}
// Pack R/G/B/A results into 32b output.
static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
const __m128i rb = _mm_packus_epi16(*R, *B);
const __m128i ga = _mm_packus_epi16(*G, *A);
const __m128i rg = _mm_unpacklo_epi8(rb, ga);
const __m128i ba = _mm_unpackhi_epi8(rb, ga);
const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
_mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo);
_mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
}
// Pack R/G/B/A results into 16b output.
static WEBP_INLINE void PackAndStore4444_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
const __m128i* const A,
uint8_t* const dst) {
#if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rg0 = _mm_packus_epi16(*R, *G);
const __m128i ba0 = _mm_packus_epi16(*B, *A);
#else
const __m128i rg0 = _mm_packus_epi16(*B, *A);
const __m128i ba0 = _mm_packus_epi16(*R, *G);
#endif
const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb...
const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga...
const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
_mm_storeu_si128((__m128i*)dst, rgba4444);
}
// Pack R/G/B results into 16b output.
static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
uint8_t* const dst) {
const __m128i r0 = _mm_packus_epi16(*R, *R);
const __m128i g0 = _mm_packus_epi16(*G, *G);
const __m128i b0 = _mm_packus_epi16(*B, *B);
const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8));
const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5);
const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
const __m128i rg = _mm_or_si128(r1, g1);
const __m128i gb = _mm_or_si128(g2, b1);
#if (WEBP_SWAP_16BIT_CSP == 0)
const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
#else
const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
#endif
_mm_storeu_si128((__m128i*)dst, rgb565);
}
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
__m128i* const in2, __m128i* const in3,
__m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
// around as follows:
// Input:
// r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
// Split the 6 registers in two sets of 3 registers: the first set as the even
// 8b bytes, the second the odd ones:
// r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
_mm_storeu_si128((__m128i*)(rgb + 0), *in0);
_mm_storeu_si128((__m128i*)(rgb + 16), *in1);
_mm_storeu_si128((__m128i*)(rgb + 32), *in2);
_mm_storeu_si128((__m128i*)(rgb + 48), *in3);
_mm_storeu_si128((__m128i*)(rgb + 64), *in4);
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
void VP8YuvToRgba32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
}
}
void VP8YuvToBgra32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
}
}
void VP8YuvToArgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 32) {
__m128i R, G, B;
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
}
}
void VP8YuvToRgba444432_SSE2(const uint8_t* y, const uint8_t* u,
const uint8_t* v, uint8_t* dst) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
}
}
void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
int n;
for (n = 0; n < 32; n += 8, dst += 16) {
__m128i R, G, B;
YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
PackAndStore565_SSE2(&R, &G, &B, dst);
}
}
void VP8YuvToRgb32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
rgb1 = _mm_packus_epi16(R2, R3);
rgb2 = _mm_packus_epi16(G0, G1);
rgb3 = _mm_packus_epi16(G2, G3);
rgb4 = _mm_packus_epi16(B0, B1);
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
bgr1 = _mm_packus_epi16(B2, B3);
bgr2 = _mm_packus_epi16(G0, G1);
bgr3 = _mm_packus_epi16(G2, G3);
bgr4 = _mm_packus_epi16(R0, R1);
bgr5= _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbaRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
y += 8;
u += 4;
v += 4;
}
for (; n < len; ++n) { // Finish off
VP8YuvToRgba(y[0], u[0], v[0], dst);
dst += 4;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
static void YuvToBgraRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
y += 8;
u += 4;
v += 4;
}
for (; n < len; ++n) { // Finish off
VP8YuvToBgra(y[0], u[0], v[0], dst);
dst += 4;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
static void YuvToArgbRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
const __m128i kAlpha = _mm_set1_epi16(255);
int n;
for (n = 0; n + 8 <= len; n += 8, dst += 32) {
__m128i R, G, B;
YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
y += 8;
u += 4;
v += 4;
}
for (; n < len; ++n) { // Finish off
VP8YuvToArgb(y[0], u[0], v[0], dst);
dst += 4;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
static void YuvToRgbRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
rgb1 = _mm_packus_epi16(R2, R3);
rgb2 = _mm_packus_epi16(G0, G1);
rgb3 = _mm_packus_epi16(G2, G3);
rgb4 = _mm_packus_epi16(B0, B1);
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
y += 32;
u += 16;
v += 16;
}
for (; n < len; ++n) { // Finish off
VP8YuvToRgb(y[0], u[0], v[0], dst);
dst += 3;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
static void YuvToBgrRow_SSE2(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
bgr1 = _mm_packus_epi16(B2, B3);
bgr2 = _mm_packus_epi16(G0, G1);
bgr3 = _mm_packus_epi16(G2, G3);
bgr4 = _mm_packus_epi16(R0, R1);
bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
y += 32;
u += 16;
v += 16;
}
for (; n < len; ++n) { // Finish off
VP8YuvToBgr(y[0], u[0], v[0], dst);
dst += 3;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2;
WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
}
//------------------------------------------------------------------------------
// RGB24/32 -> YUV converters
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store either 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
// Function that inserts a value of the second half of the in buffer in between
// every two char of the first half.
static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
out[0] = _mm_unpacklo_epi8(in[0], in[3]);
out[1] = _mm_unpackhi_epi8(in[0], in[3]);
out[2] = _mm_unpacklo_epi8(in[1], in[4]);
out[3] = _mm_unpackhi_epi8(in[1], in[4]);
out[4] = _mm_unpacklo_epi8(in[2], in[5]);
out[5] = _mm_unpackhi_epi8(in[2], in[5]);
}
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
__m128i tmp[6];
tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
RGB24PackedToPlanarHelper_SSE2(tmp, out);
RGB24PackedToPlanarHelper_SSE2(out, tmp);
RGB24PackedToPlanarHelper_SSE2(tmp, out);
RGB24PackedToPlanarHelper_SSE2(out, tmp);
RGB24PackedToPlanarHelper_SSE2(tmp, out);
}
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
__m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
__m128i a2 = LOAD_16(argb + 8);
__m128i a3 = LOAD_16(argb + 12);
VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
rgb[0] = _mm_unpacklo_epi8(a1, zero);
rgb[1] = _mm_unpackhi_epi8(a1, zero);
rgb[2] = _mm_unpacklo_epi8(a2, zero);
rgb[3] = _mm_unpackhi_epi8(a2, zero);
rgb[4] = _mm_unpacklo_epi8(a3, zero);
rgb[5] = _mm_unpackhi_epi8(a3, zero);
}
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's a macro and not a function because we need to use immediate values with
// srai_epi32, e.g.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
ROUNDER, DESCALE_FIX, OUT) do { \
const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
(OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
} while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const U,
__m128i* const V) {
const __m128i kRG_u = MK_CST_16(-9719, -19081);
const __m128i kGB_u = MK_CST_16(0, 28800);
const __m128i kRG_v = MK_CST_16(28800, 0);
const __m128i kGB_v = MK_CST_16(-24116, -4684);
const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
kHALF_UV, YUV_FIX + 2, *U);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
kHALF_UV, YUV_FIX + 2, *V);
}
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY_SSE2(const uint8_t* rgb, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
__m128i rgb_plane[6];
int j;
RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
}
for (; i < width; ++i, rgb += 3) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_SSE2(const uint8_t* bgr, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
__m128i bgr_plane[6];
int j;
RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
}
for (; i < width; ++i, bgr += 3) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
static void ConvertARGBToY_SSE2(const uint32_t* argb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6];
RGB32PackedToPlanar_SSE2(&argb[i], rgb);
ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i) { // left-over
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack_SSE2(const __m128i* const A,
const __m128i* const B,
__m128i* const out) {
const __m128i k2 = _mm_set1_epi16(2);
const __m128i C = _mm_madd_epi16(*A, k2);
const __m128i D = _mm_madd_epi16(*B, k2);
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV_SSE2(const uint32_t* argb,
uint8_t* u, uint8_t* v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1;
RGB32PackedToPlanar_SSE2(&argb[i], rgb);
HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1);
if (!do_store) {
const __m128i prev_u = LOAD_16(u);
const __m128i prev_v = LOAD_16(v);
U0 = _mm_avg_epu8(U0, prev_u);
V0 = _mm_avg_epu8(V0, prev_v);
}
STORE_16(U0, u);
STORE_16(V0, v);
}
if (i < src_width) { // left-over
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
}
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
const uint16_t* const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ...
// column-wise transpose
const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 | g0 g1 ..
const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 | x x x x
const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 | g4 g5 ..
const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 | x x x x
*r = _mm_unpacklo_epi64(B0, B2);
*g = _mm_unpackhi_epi64(B0, B2);
*b = _mm_unpacklo_epi64(B1, B3);
}
static void ConvertRGBA32ToUV_SSE2(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {
__m128i r, g, b, U0, V0, U1, V1;
RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b);
ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
STORE_16(_mm_packus_epi16(U0, U1), u);
STORE_16(_mm_packus_epi16(V0, V1), v);
u += 16;
v += 16;
rgb += 2 * 32;
}
if (max_width < width) { // left-over
WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
}
}
//------------------------------------------------------------------------------
extern void WebPInitConvertARGBToYUVSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
WebPConvertARGBToY = ConvertARGBToY_SSE2;
WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
}
//------------------------------------------------------------------------------
#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
static uint16_t clip_y(int v) {
return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
}
static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
uint16_t* dst, int len) {
uint64_t diff = 0;
uint32_t tmp[4];
int i;
const __m128i zero = _mm_setzero_si128();
const __m128i max = _mm_set1_epi16(MAX_Y);
const __m128i one = _mm_set1_epi16(1);
__m128i sum = zero;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_y
const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
const __m128i F = _mm_add_epi16(C, D); // new_y
const __m128i G = _mm_or_si128(E, one); // -1 or 1
const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
_mm_storeu_si128((__m128i*)(dst + i), H);
sum = _mm_add_epi32(sum, I);
}
_mm_storeu_si128((__m128i*)tmp, sum);
diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
for (; i < len; ++i) {
const int diff_y = ref[i] - src[i];
const int new_y = (int)dst[i] + diff_y;
dst[i] = clip_y(new_y);
diff += (uint64_t)abs(diff_y);
}
return diff;
}
static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
int16_t* dst, int len) {
int i = 0;
for (i = 0; i + 8 <= len; i += 8) {
const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
const __m128i D = _mm_sub_epi16(A, B); // diff_uv
const __m128i E = _mm_add_epi16(C, D); // new_uv
_mm_storeu_si128((__m128i*)(dst + i), E);
}
for (; i < len; ++i) {
const int diff_uv = ref[i] - src[i];
dst[i] += diff_uv;
}
}
static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
const uint16_t* best_y, uint16_t* out) {
int i;
const __m128i kCst8 = _mm_set1_epi16(8);
const __m128i max = _mm_set1_epi16(MAX_Y);
const __m128i zero = _mm_setzero_si128();
for (i = 0; i + 8 <= len; i += 8) {
const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
const __m128i a0b1 = _mm_add_epi16(a0, b1);
const __m128i a1b0 = _mm_add_epi16(a1, b0);
const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
const __m128i d0 = _mm_add_epi16(c1, a0);
const __m128i d1 = _mm_add_epi16(c0, a1);
const __m128i e0 = _mm_srai_epi16(d0, 1);
const __m128i e1 = _mm_srai_epi16(d1, 1);
const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
const __m128i h0 = _mm_add_epi16(g0, f0);
const __m128i h1 = _mm_add_epi16(g1, f1);
const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
_mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
_mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
}
for (; i < len; ++i) {
// (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4 =
// = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
// We reuse the common sub-expressions.
const int a0b1 = A[i + 0] + B[i + 1];
const int a1b0 = A[i + 1] + B[i + 0];
const int a0a1b0b1 = a0b1 + a1b0 + 8;
const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
}
}
#undef MAX_Y
//------------------------------------------------------------------------------
extern void WebPInitSharpYUVSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
#endif // WEBP_USE_SSE2

View File

@ -0,0 +1,613 @@
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)
#include "dsp_yuv.h"
#if defined(WEBP_USE_SSE41)
#include "dsp_common_sse41.h"
#include <stdlib.h>
#include <smmintrin.h>
//-----------------------------------------------------------------------------
// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y + 26149 * v - 14234) >> 6
// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
// B = (19077 * y + 33050 * u - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
const __m128i* const U0,
const __m128i* const V0,
__m128i* const R,
__m128i* const G,
__m128i* const B) {
const __m128i k19077 = _mm_set1_epi16(19077);
const __m128i k26149 = _mm_set1_epi16(26149);
const __m128i k14234 = _mm_set1_epi16(14234);
// 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
const __m128i k33050 = _mm_set1_epi16((short)33050);
const __m128i k17685 = _mm_set1_epi16(17685);
const __m128i k6419 = _mm_set1_epi16(6419);
const __m128i k13320 = _mm_set1_epi16(13320);
const __m128i k8708 = _mm_set1_epi16(8708);
const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
const __m128i R1 = _mm_sub_epi16(Y1, k14234);
const __m128i R2 = _mm_add_epi16(R1, R0);
const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
const __m128i G2 = _mm_add_epi16(Y1, k8708);
const __m128i G3 = _mm_add_epi16(G0, G1);
const __m128i G4 = _mm_sub_epi16(G2, G3);
// be careful with the saturated *unsigned* arithmetic here!
const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
const __m128i B1 = _mm_adds_epu16(B0, Y1);
const __m128i B2 = _mm_subs_epu16(B1, k17685);
// use logical shift for B2, which can be larger than 32767
*R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815]
*G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710]
*B = _mm_srli_epi16(B2, 6); // range: [0, 34238]
}
// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
}
// Load and replicate the U/V samples
static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
const __m128i zero = _mm_setzero_si128();
const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
}
// Convert 32 samples of YUV444 to R/G/B
static void YUV444ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
V0 = Load_HI_16_SSE41(v);
ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
}
// Convert 32 samples of YUV420 to R/G/B
static void YUV420ToRGB_SSE41(const uint8_t* const y,
const uint8_t* const u,
const uint8_t* const v,
__m128i* const R, __m128i* const G,
__m128i* const B) {
const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
V0 = Load_UV_HI_8_SSE41(v);
ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
}
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
static WEBP_INLINE void PlanarTo24b_SSE41(
__m128i* const in0, __m128i* const in1, __m128i* const in2,
__m128i* const in3, __m128i* const in4, __m128i* const in5,
uint8_t* const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
// around as follows:
// Input:
// r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
// Split the 6 registers in two sets of 3 registers: the first set as the even
// 8b bytes, the second the odd ones:
// r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);
_mm_storeu_si128((__m128i*)(rgb + 0), *in0);
_mm_storeu_si128((__m128i*)(rgb + 16), *in1);
_mm_storeu_si128((__m128i*)(rgb + 32), *in2);
_mm_storeu_si128((__m128i*)(rgb + 48), *in3);
_mm_storeu_si128((__m128i*)(rgb + 64), *in4);
_mm_storeu_si128((__m128i*)(rgb + 80), *in5);
}
void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
rgb1 = _mm_packus_epi16(R2, R3);
rgb2 = _mm_packus_epi16(G0, G1);
rgb3 = _mm_packus_epi16(G2, G3);
rgb4 = _mm_packus_epi16(B0, B1);
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
bgr1 = _mm_packus_epi16(B2, B3);
bgr2 = _mm_packus_epi16(G0, G1);
bgr3 = _mm_packus_epi16(G2, G3);
bgr4 = _mm_packus_epi16(R0, R1);
bgr5= _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
static void YuvToRgbRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
rgb0 = _mm_packus_epi16(R0, R1);
rgb1 = _mm_packus_epi16(R2, R3);
rgb2 = _mm_packus_epi16(G0, G1);
rgb3 = _mm_packus_epi16(G2, G3);
rgb4 = _mm_packus_epi16(B0, B1);
rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
y += 32;
u += 16;
v += 16;
}
for (; n < len; ++n) { // Finish off
VP8YuvToRgb(y[0], u[0], v[0], dst);
dst += 3;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
static void YuvToBgrRow_SSE41(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len) {
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
__m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV420ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB_SSE41(y + 8, u + 4, v + 4, &R1, &G1, &B1);
YUV420ToRGB_SSE41(y + 16, u + 8, v + 8, &R2, &G2, &B2);
YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
bgr0 = _mm_packus_epi16(B0, B1);
bgr1 = _mm_packus_epi16(B2, B3);
bgr2 = _mm_packus_epi16(G0, G1);
bgr3 = _mm_packus_epi16(G2, G3);
bgr4 = _mm_packus_epi16(R0, R1);
bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
y += 32;
u += 16;
v += 16;
}
for (; n < len; ++n) { // Finish off
VP8YuvToBgr(y[0], u[0], v[0], dst);
dst += 3;
y += 1;
u += (n & 1);
v += (n & 1);
}
}
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE41;
WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE41;
}
//------------------------------------------------------------------------------
// RGB24/32 -> YUV converters
// Load eight 16b-words from *src.
#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
// Store either 16b-words into *dst
#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
#define WEBP_SSE41_SHUFF(OUT) do { \
const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \
const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \
const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \
const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \
const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \
const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \
\
/* OR everything to get one channel */ \
const __m128i tmp6 = _mm_or_si128(tmp0, tmp1); \
const __m128i tmp7 = _mm_or_si128(tmp3, tmp4); \
out[OUT + 0] = _mm_or_si128(tmp6, tmp2); \
out[OUT + 1] = _mm_or_si128(tmp7, tmp5); \
} while (0);
// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// Similar to PlanarTo24bHelper(), but in reverse order.
static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb + 0));
const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + 48));
const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + 64));
const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + 80));
// Compute RR.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
WEBP_SSE41_SHUFF(0)
}
// Compute GG.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
WEBP_SSE41_SHUFF(2)
}
// Compute BB.
{
const __m128i shuff0 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
const __m128i shuff1 = _mm_set_epi8(
-1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
const __m128i shuff2 = _mm_set_epi8(
15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
WEBP_SSE41_SHUFF(4)
}
}
#undef WEBP_SSE41_SHUFF
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
const uint32_t* const argb, __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
__m128i a0 = LOAD_16(argb + 0);
__m128i a1 = LOAD_16(argb + 4);
__m128i a2 = LOAD_16(argb + 8);
__m128i a3 = LOAD_16(argb + 12);
VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
rgb[0] = _mm_unpacklo_epi8(a1, zero);
rgb[1] = _mm_unpackhi_epi8(a1, zero);
rgb[2] = _mm_unpacklo_epi8(a2, zero);
rgb[3] = _mm_unpackhi_epi8(a2, zero);
rgb[4] = _mm_unpacklo_epi8(a3, zero);
rgb[5] = _mm_unpackhi_epi8(a3, zero);
}
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
// It's a macro and not a function because we need to use immediate values with
// srai_epi32, e.g.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
ROUNDER, DESCALE_FIX, OUT) do { \
const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
(OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
} while (0)
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const Y) {
const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
const __m128i kGB_y = MK_CST_16(16384, 6420);
const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
const __m128i* const G,
const __m128i* const B,
__m128i* const U,
__m128i* const V) {
const __m128i kRG_u = MK_CST_16(-9719, -19081);
const __m128i kGB_u = MK_CST_16(0, 28800);
const __m128i kRG_v = MK_CST_16(28800, 0);
const __m128i kGB_v = MK_CST_16(-24116, -4684);
const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
kHALF_UV, YUV_FIX + 2, *U);
TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
kHALF_UV, YUV_FIX + 2, *V);
}
#undef MK_CST_16
#undef TRANSFORM
static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
__m128i rgb_plane[6];
int j;
RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
}
for (; i < width; ++i, rgb += 3) { // left-over
y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
}
}
static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
const int max_width = width & ~31;
int i;
for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
__m128i bgr_plane[6];
int j;
RGB24PackedToPlanar_SSE41(bgr, bgr_plane);
for (j = 0; j < 2; ++j, i += 16) {
const __m128i zero = _mm_setzero_si128();
__m128i r, g, b, Y0, Y1;
// Convert to 16-bit Y.
b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
// Convert to 16-bit Y.
b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
// Cast to 8-bit and store.
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
}
for (; i < width; ++i, bgr += 3) { // left-over
y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
}
}
static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
__m128i Y0, Y1, rgb[6];
RGB32PackedToPlanar_SSE41(&argb[i], rgb);
ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0);
ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i) { // left-over
const uint32_t p = argb[i];
y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
YUV_HALF);
}
}
// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
static void HorizontalAddPack_SSE41(const __m128i* const A,
const __m128i* const B,
__m128i* const out) {
const __m128i k2 = _mm_set1_epi16(2);
const __m128i C = _mm_madd_epi16(*A, k2);
const __m128i D = _mm_madd_epi16(*B, k2);
*out = _mm_packs_epi32(C, D);
}
static void ConvertARGBToUV_SSE41(const uint32_t* argb,
uint8_t* u, uint8_t* v,
int src_width, int do_store) {
const int max_width = src_width & ~31;
int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
__m128i rgb[6], U0, V0, U1, V1;
RGB32PackedToPlanar_SSE41(&argb[i], rgb);
HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);
HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1);
if (!do_store) {
const __m128i prev_u = LOAD_16(u);
const __m128i prev_v = LOAD_16(v);
U0 = _mm_avg_epu8(U0, prev_u);
V0 = _mm_avg_epu8(V0, prev_v);
}
STORE_16(U0, u);
STORE_16(V0, v);
}
if (i < src_width) { // left-over
WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
}
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
const uint16_t* const rgbx,
__m128i* const r, __m128i* const g, __m128i* const b) {
const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ...
// aarrggbb as 16-bit.
const __m128i shuff0 =
_mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
const __m128i shuff1 =
_mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
// R0R1G0G1
// B0B1****
// R2R3G2G3
// B2B3****
// (OR is used to free port 5 for the unpack)
const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
const __m128i B1 = _mm_or_si128(A0, A1);
const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
const __m128i B3 = _mm_or_si128(A2, A3);
// Gather the channels.
*r = _mm_unpacklo_epi64(B0, B2);
*g = _mm_unpackhi_epi64(B0, B2);
*b = _mm_unpackhi_epi64(B1, B3);
}
static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width) {
const int max_width = width & ~15;
const uint16_t* const last_rgb = rgb + 4 * max_width;
while (rgb < last_rgb) {
__m128i r, g, b, U0, V0, U1, V1;
RGBA32PackedToPlanar_16b_SSE41(rgb + 0, &r, &g, &b);
ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0);
RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b);
ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1);
STORE_16(_mm_packus_epi16(U0, U1), u);
STORE_16(_mm_packus_epi16(V0, V1), v);
u += 16;
v += 16;
rgb += 2 * 32;
}
if (max_width < width) { // left-over
WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
}
}
//------------------------------------------------------------------------------
extern void WebPInitConvertARGBToYUVSSE41(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
WebPConvertARGBToY = ConvertARGBToY_SSE41;
WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
}
//------------------------------------------------------------------------------
#else // !WEBP_USE_SSE41
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
#endif // WEBP_USE_SSE41

View File

@ -0,0 +1,443 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Alpha-plane compression.
//
// Author: Skal (pascal.massimino@gmail.com)
#include <assert.h>
#include <stdlib.h>
#include "enc_vp8i_enc.h"
#include "dsp_dsp.h"
#include "utils_filters_utils.h"
#include "utils_quant_levels_utils.h"
#include "utils_utils.h"
#include "webp_format_constants.h"
// -----------------------------------------------------------------------------
// Encodes the given alpha data via specified compression method 'method'.
// The pre-processing (quantization) is performed if 'quality' is less than 100.
// For such cases, the encoding is lossy. The valid range is [0, 100] for
// 'quality' and [0, 1] for 'method':
// 'method = 0' - No compression;
// 'method = 1' - Use lossless coder on the alpha plane only
// 'filter' values [0, 4] correspond to prediction modes none, horizontal,
// vertical & gradient filters. The prediction mode 4 will try all the
// prediction modes 0 to 3 and pick the best one.
// 'effort_level': specifies how much effort must be spent to try and reduce
// the compressed output size. In range 0 (quick) to 6 (slow).
//
// 'output' corresponds to the buffer containing compressed alpha data.
// This buffer is allocated by this method and caller should call
// WebPSafeFree(*output) when done.
// 'output_size' corresponds to size of this compressed alpha buffer.
//
// Returns 1 on successfully encoding the alpha and
// 0 if either:
// invalid quality or method, or
// memory allocation for the compressed data fails.
#include "enc_vp8li_enc.h"
static int EncodeLossless(const uint8_t* const data, int width, int height,
int effort_level, // in [0..6] range
int use_quality_100, VP8LBitWriter* const bw,
WebPAuxStats* const stats) {
int ok = 0;
WebPConfig config;
WebPPicture picture;
WebPPictureInit(&picture);
picture.width = width;
picture.height = height;
picture.use_argb = 1;
picture.stats = stats;
if (!WebPPictureAlloc(&picture)) return 0;
// Transfer the alpha values to the green channel.
WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
picture.argb, picture.argb_stride);
WebPConfigInit(&config);
config.lossless = 1;
// Enable exact, or it would alter RGB values of transparent alpha, which is
// normally OK but not here since we are not encoding the input image but an
// internal encoding-related image containing necessary exact information in
// RGB channels.
config.exact = 1;
config.method = effort_level; // impact is very small
// Set a low default quality for encoding alpha. Ensure that Alpha quality at
// lower methods (3 and below) is less than the threshold for triggering
// costly 'BackwardReferencesTraceBackwards'.
// If the alpha quality is set to 100 and the method to 6, allow for a high
// lossless quality to trigger the cruncher.
config.quality =
(use_quality_100 && effort_level == 6) ? 100 : 8.f * effort_level;
assert(config.quality >= 0 && config.quality <= 100.f);
// TODO(urvang): Temporary fix to avoid generating images that trigger
// a decoder bug related to alpha with color cache.
// See: https://code.google.com/p/webp/issues/detail?id=239
// Need to re-enable this later.
ok = (VP8LEncodeStream(&config, &picture, bw, 0 /*use_cache*/) == VP8_ENC_OK);
WebPPictureFree(&picture);
ok = ok && !bw->error_;
if (!ok) {
VP8LBitWriterWipeOut(bw);
return 0;
}
return 1;
}
// -----------------------------------------------------------------------------
// Small struct to hold the result of a filter mode compression attempt.
typedef struct {
size_t score;
VP8BitWriter bw;
WebPAuxStats stats;
} FilterTrial;
// This function always returns an initialized 'bw' object, even upon error.
static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
int method, int filter, int reduce_levels,
int effort_level, // in [0..6] range
uint8_t* const tmp_alpha,
FilterTrial* result) {
int ok = 0;
const uint8_t* alpha_src;
WebPFilterFunc filter_func;
uint8_t header;
const size_t data_size = width * height;
const uint8_t* output = NULL;
size_t output_size = 0;
VP8LBitWriter tmp_bw;
assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
assert(filter >= 0 && filter < WEBP_FILTER_LAST);
assert(method >= ALPHA_NO_COMPRESSION);
assert(method <= ALPHA_LOSSLESS_COMPRESSION);
assert(sizeof(header) == ALPHA_HEADER_LEN);
filter_func = WebPFilters[filter];
if (filter_func != NULL) {
filter_func(data, width, height, width, tmp_alpha);
alpha_src = tmp_alpha;
} else {
alpha_src = data;
}
if (method != ALPHA_NO_COMPRESSION) {
ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
!reduce_levels, &tmp_bw, &result->stats);
if (ok) {
output = VP8LBitWriterFinish(&tmp_bw);
output_size = VP8LBitWriterNumBytes(&tmp_bw);
if (output_size > data_size) {
// compressed size is larger than source! Revert to uncompressed mode.
method = ALPHA_NO_COMPRESSION;
VP8LBitWriterWipeOut(&tmp_bw);
}
} else {
VP8LBitWriterWipeOut(&tmp_bw);
return 0;
}
}
if (method == ALPHA_NO_COMPRESSION) {
output = alpha_src;
output_size = data_size;
ok = 1;
}
// Emit final result.
header = method | (filter << 2);
if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
if (method != ALPHA_NO_COMPRESSION) {
VP8LBitWriterWipeOut(&tmp_bw);
}
ok = ok && !result->bw.error_;
result->score = VP8BitWriterSize(&result->bw);
return ok;
}
// -----------------------------------------------------------------------------
static int GetNumColors(const uint8_t* data, int width, int height,
int stride) {
int j;
int colors = 0;
uint8_t color[256] = { 0 };
for (j = 0; j < height; ++j) {
int i;
const uint8_t* const p = data + j * stride;
for (i = 0; i < width; ++i) {
color[p[i]] = 1;
}
}
for (j = 0; j < 256; ++j) {
if (color[j] > 0) ++colors;
}
return colors;
}
#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
// Given the input 'filter' option, return an OR'd bit-set of filters to try.
static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
int filter, int effort_level) {
uint32_t bit_map = 0U;
if (filter == WEBP_FILTER_FAST) {
// Quick estimate of the best candidate.
int try_filter_none = (effort_level > 3);
const int kMinColorsForFilterNone = 16;
const int kMaxColorsForFilterNone = 192;
const int num_colors = GetNumColors(alpha, width, height, width);
// For low number of colors, NONE yields better compression.
filter = (num_colors <= kMinColorsForFilterNone)
? WEBP_FILTER_NONE
: WebPEstimateBestFilter(alpha, width, height, width);
bit_map |= 1 << filter;
// For large number of colors, try FILTER_NONE in addition to the best
// filter as well.
if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
bit_map |= FILTER_TRY_NONE;
}
} else if (filter == WEBP_FILTER_NONE) {
bit_map = FILTER_TRY_NONE;
} else { // WEBP_FILTER_BEST -> try all
bit_map = FILTER_TRY_ALL;
}
return bit_map;
}
static void InitFilterTrial(FilterTrial* const score) {
score->score = (size_t)~0U;
VP8BitWriterInit(&score->bw, 0);
}
static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
size_t data_size, int method, int filter,
int reduce_levels, int effort_level,
uint8_t** const output,
size_t* const output_size,
WebPAuxStats* const stats) {
int ok = 1;
FilterTrial best;
uint32_t try_map =
GetFilterMap(alpha, width, height, filter, effort_level);
InitFilterTrial(&best);
if (try_map != FILTER_TRY_NONE) {
uint8_t* filtered_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
if (filtered_alpha == NULL) return 0;
for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
if (try_map & 1) {
FilterTrial trial;
ok = EncodeAlphaInternal(alpha, width, height, method, filter,
reduce_levels, effort_level, filtered_alpha,
&trial);
if (ok && trial.score < best.score) {
VP8BitWriterWipeOut(&best.bw);
best = trial;
} else {
VP8BitWriterWipeOut(&trial.bw);
}
}
}
WebPSafeFree(filtered_alpha);
} else {
ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
reduce_levels, effort_level, NULL, &best);
}
if (ok) {
#if !defined(WEBP_DISABLE_STATS)
if (stats != NULL) {
stats->lossless_features = best.stats.lossless_features;
stats->histogram_bits = best.stats.histogram_bits;
stats->transform_bits = best.stats.transform_bits;
stats->cache_bits = best.stats.cache_bits;
stats->palette_size = best.stats.palette_size;
stats->lossless_size = best.stats.lossless_size;
stats->lossless_hdr_size = best.stats.lossless_hdr_size;
stats->lossless_data_size = best.stats.lossless_data_size;
}
#else
(void)stats;
#endif
*output_size = VP8BitWriterSize(&best.bw);
*output = VP8BitWriterBuf(&best.bw);
} else {
VP8BitWriterWipeOut(&best.bw);
}
return ok;
}
static int EncodeAlpha(VP8Encoder* const enc,
int quality, int method, int filter,
int effort_level,
uint8_t** const output, size_t* const output_size) {
const WebPPicture* const pic = enc->pic_;
const int width = pic->width;
const int height = pic->height;
uint8_t* quant_alpha = NULL;
const size_t data_size = width * height;
uint64_t sse = 0;
int ok = 1;
const int reduce_levels = (quality < 100);
// quick sanity checks
assert((uint64_t)data_size == (uint64_t)width * height); // as per spec
assert(enc != NULL && pic != NULL && pic->a != NULL);
assert(output != NULL && output_size != NULL);
assert(width > 0 && height > 0);
assert(pic->a_stride >= width);
assert(filter >= WEBP_FILTER_NONE && filter <= WEBP_FILTER_FAST);
if (quality < 0 || quality > 100) {
return 0;
}
if (method < ALPHA_NO_COMPRESSION || method > ALPHA_LOSSLESS_COMPRESSION) {
return 0;
}
if (method == ALPHA_NO_COMPRESSION) {
// Don't filter, as filtering will make no impact on compressed size.
filter = WEBP_FILTER_NONE;
}
quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
if (quant_alpha == NULL) {
return 0;
}
// Extract alpha data (width x height) from raw_data (stride x height).
WebPCopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
if (reduce_levels) { // No Quantization required for 'quality = 100'.
// 16 alpha levels gives quite a low MSE w.r.t original alpha plane hence
// mapped to moderate quality 70. Hence Quality:[0, 70] -> Levels:[2, 16]
// and Quality:]70, 100] -> Levels:]16, 256].
const int alpha_levels = (quality <= 70) ? (2 + quality / 5)
: (16 + (quality - 70) * 8);
ok = QuantizeLevels(quant_alpha, width, height, alpha_levels, &sse);
}
if (ok) {
VP8FiltersInit();
ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
filter, reduce_levels, effort_level, output,
output_size, pic->stats);
#if !defined(WEBP_DISABLE_STATS)
if (pic->stats != NULL) { // need stats?
pic->stats->coded_size += (int)(*output_size);
enc->sse_[3] = sse;
}
#endif
}
WebPSafeFree(quant_alpha);
return ok;
}
//------------------------------------------------------------------------------
// Main calls
static int CompressAlphaJob(void* arg1, void* dummy) {
VP8Encoder* const enc = (VP8Encoder*)arg1;
const WebPConfig* config = enc->config_;
uint8_t* alpha_data = NULL;
size_t alpha_size = 0;
const int effort_level = config->method; // maps to [0..6]
const WEBP_FILTER_TYPE filter =
(config->alpha_filtering == 0) ? WEBP_FILTER_NONE :
(config->alpha_filtering == 1) ? WEBP_FILTER_FAST :
WEBP_FILTER_BEST;
if (!EncodeAlpha(enc, config->alpha_quality, config->alpha_compression,
filter, effort_level, &alpha_data, &alpha_size)) {
return 0;
}
if (alpha_size != (uint32_t)alpha_size) { // Sanity check.
WebPSafeFree(alpha_data);
return 0;
}
enc->alpha_data_size_ = (uint32_t)alpha_size;
enc->alpha_data_ = alpha_data;
(void)dummy;
return 1;
}
void VP8EncInitAlpha(VP8Encoder* const enc) {
WebPInitAlphaProcessing();
enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
enc->alpha_data_ = NULL;
enc->alpha_data_size_ = 0;
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
WebPGetWorkerInterface()->Init(worker);
worker->data1 = enc;
worker->data2 = NULL;
worker->hook = CompressAlphaJob;
}
}
int VP8EncStartAlpha(VP8Encoder* const enc) {
if (enc->has_alpha_) {
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
// Makes sure worker is good to go.
if (!WebPGetWorkerInterface()->Reset(worker)) {
return 0;
}
WebPGetWorkerInterface()->Launch(worker);
return 1;
} else {
return CompressAlphaJob(enc, NULL); // just do the job right away
}
}
return 1;
}
int VP8EncFinishAlpha(VP8Encoder* const enc) {
if (enc->has_alpha_) {
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
if (!WebPGetWorkerInterface()->Sync(worker)) return 0; // error
}
}
return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
}
int VP8EncDeleteAlpha(VP8Encoder* const enc) {
int ok = 1;
if (enc->thread_level_ > 0) {
WebPWorker* const worker = &enc->alpha_worker_;
// finish anything left in flight
ok = WebPGetWorkerInterface()->Sync(worker);
// still need to end the worker, even if !ok
WebPGetWorkerInterface()->End(worker);
}
WebPSafeFree(enc->alpha_data_);
enc->alpha_data_ = NULL;
enc->alpha_data_size_ = 0;
enc->has_alpha_ = 0;
return ok;
}

View File

@ -0,0 +1,475 @@
// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Macroblock analysis
//
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include "enc_vp8i_enc.h"
#include "enc_cost_enc.h"
#include "utils_utils.h"
#define MAX_ITERS_K_MEANS 6
//------------------------------------------------------------------------------
// Smooth the segment map by replacing isolated block by the majority of its
// neighbours.
static void SmoothSegmentMap(VP8Encoder* const enc) {
int n, x, y;
const int w = enc->mb_w_;
const int h = enc->mb_h_;
const int majority_cnt_3_x_3_grid = 5;
uint8_t* const tmp = (uint8_t*)WebPSafeMalloc(w * h, sizeof(*tmp));
assert((uint64_t)(w * h) == (uint64_t)w * h); // no overflow, as per spec
if (tmp == NULL) return;
for (y = 1; y < h - 1; ++y) {
for (x = 1; x < w - 1; ++x) {
int cnt[NUM_MB_SEGMENTS] = { 0 };
const VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
int majority_seg = mb->segment_;
// Check the 8 neighbouring segment values.
cnt[mb[-w - 1].segment_]++; // top-left
cnt[mb[-w + 0].segment_]++; // top
cnt[mb[-w + 1].segment_]++; // top-right
cnt[mb[ - 1].segment_]++; // left
cnt[mb[ + 1].segment_]++; // right
cnt[mb[ w - 1].segment_]++; // bottom-left
cnt[mb[ w + 0].segment_]++; // bottom
cnt[mb[ w + 1].segment_]++; // bottom-right
for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
if (cnt[n] >= majority_cnt_3_x_3_grid) {
majority_seg = n;
break;
}
}
tmp[x + y * w] = majority_seg;
}
}
for (y = 1; y < h - 1; ++y) {
for (x = 1; x < w - 1; ++x) {
VP8MBInfo* const mb = &enc->mb_info_[x + w * y];
mb->segment_ = tmp[x + y * w];
}
}
WebPSafeFree(tmp);
}
//------------------------------------------------------------------------------
// set segment susceptibility alpha_ / beta_
static WEBP_INLINE int clip(int v, int m, int M) {
return (v < m) ? m : (v > M) ? M : v;
}
static void SetSegmentAlphas(VP8Encoder* const enc,
const int centers[NUM_MB_SEGMENTS],
int mid) {
const int nb = enc->segment_hdr_.num_segments_;
int min = centers[0], max = centers[0];
int n;
if (nb > 1) {
for (n = 0; n < nb; ++n) {
if (min > centers[n]) min = centers[n];
if (max < centers[n]) max = centers[n];
}
}
if (max == min) max = min + 1;
assert(mid <= max && mid >= min);
for (n = 0; n < nb; ++n) {
const int alpha = 255 * (centers[n] - mid) / (max - min);
const int beta = 255 * (centers[n] - min) / (max - min);
enc->dqm_[n].alpha_ = clip(alpha, -127, 127);
enc->dqm_[n].beta_ = clip(beta, 0, 255);
}
}
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
#define MAX_ALPHA 255 // 8b of precision for susceptibilities.
#define ALPHA_SCALE (2 * MAX_ALPHA) // scaling factor for alpha.
#define DEFAULT_ALPHA (-1)
#define IS_BETTER_ALPHA(alpha, best_alpha) ((alpha) > (best_alpha))
static int FinalAlphaValue(int alpha) {
alpha = MAX_ALPHA - alpha;
return clip(alpha, 0, MAX_ALPHA);
}
static int GetAlpha(const VP8Histogram* const histo) {
// 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
// values which happen to be mostly noise. This leaves the maximum precision
// for handling the useful small values which contribute most.
const int max_value = histo->max_value;
const int last_non_zero = histo->last_non_zero;
const int alpha =
(max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
return alpha;
}
static void InitHistogram(VP8Histogram* const histo) {
histo->max_value = 0;
histo->last_non_zero = 1;
}
//------------------------------------------------------------------------------
// Simplified k-Means, to assign Nb segments based on alpha-histogram
static void AssignSegments(VP8Encoder* const enc,
const int alphas[MAX_ALPHA + 1]) {
// 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
// explicit check is needed to avoid spurious warning about 'n + 1' exceeding
// array bounds of 'centers' with some compilers (noticed with gcc-4.9).
const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
int centers[NUM_MB_SEGMENTS];
int weighted_average = 0;
int map[MAX_ALPHA + 1];
int a, n, k;
int min_a = 0, max_a = MAX_ALPHA, range_a;
// 'int' type is ok for histo, and won't overflow
int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
assert(nb >= 1);
assert(nb <= NUM_MB_SEGMENTS);
// bracket the input
for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
min_a = n;
for (n = MAX_ALPHA; n > min_a && alphas[n] == 0; --n) {}
max_a = n;
range_a = max_a - min_a;
// Spread initial centers evenly
for (k = 0, n = 1; k < nb; ++k, n += 2) {
assert(n < 2 * nb);
centers[k] = min_a + (n * range_a) / (2 * nb);
}
for (k = 0; k < MAX_ITERS_K_MEANS; ++k) { // few iters are enough
int total_weight;
int displaced;
// Reset stats
for (n = 0; n < nb; ++n) {
accum[n] = 0;
dist_accum[n] = 0;
}
// Assign nearest center for each 'a'
n = 0; // track the nearest center for current 'a'
for (a = min_a; a <= max_a; ++a) {
if (alphas[a]) {
while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
n++;
}
map[a] = n;
// accumulate contribution into best centroid
dist_accum[n] += a * alphas[a];
accum[n] += alphas[a];
}
}
// All point are classified. Move the centroids to the
// center of their respective cloud.
displaced = 0;
weighted_average = 0;
total_weight = 0;
for (n = 0; n < nb; ++n) {
if (accum[n]) {
const int new_center = (dist_accum[n] + accum[n] / 2) / accum[n];
displaced += abs(centers[n] - new_center);
centers[n] = new_center;
weighted_average += new_center * accum[n];
total_weight += accum[n];
}
}
weighted_average = (weighted_average + total_weight / 2) / total_weight;
if (displaced < 5) break; // no need to keep on looping...
}
// Map each original value to the closest centroid
for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
VP8MBInfo* const mb = &enc->mb_info_[n];
const int alpha = mb->alpha_;
mb->segment_ = map[alpha];
mb->alpha_ = centers[map[alpha]]; // for the record.
}
if (nb > 1) {
const int smooth = (enc->config_->preprocessing & 1);
if (smooth) SmoothSegmentMap(enc);
}
SetSegmentAlphas(enc, centers, weighted_average); // pick some alphas.
}
//------------------------------------------------------------------------------
// Macroblock analysis: collect histogram for each mode, deduce the maximal
// susceptibility and set best modes for this macroblock.
// Segment assignment is done later.
// Number of modes to inspect for alpha_ evaluation. We don't need to test all
// the possible modes during the analysis phase: we risk falling into a local
// optimum, or be subject to boundary effect
#define MAX_INTRA16_MODE 2
#define MAX_INTRA4_MODE 2
#define MAX_UV_MODE 2
static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
const int max_mode = MAX_INTRA16_MODE;
int mode;
int best_alpha = DEFAULT_ALPHA;
int best_mode = 0;
VP8MakeLuma16Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
VP8Histogram histo;
int alpha;
InitHistogram(&histo);
VP8CollectHistogram(it->yuv_in_ + Y_OFF_ENC,
it->yuv_p_ + VP8I16ModeOffsets[mode],
0, 16, &histo);
alpha = GetAlpha(&histo);
if (IS_BETTER_ALPHA(alpha, best_alpha)) {
best_alpha = alpha;
best_mode = mode;
}
}
VP8SetIntra16Mode(it, best_mode);
return best_alpha;
}
static int FastMBAnalyze(VP8EncIterator* const it) {
// Empirical cut-off value, should be around 16 (~=block size). We use the
// [8-17] range and favor intra4 at high quality, intra16 for low quality.
const int q = (int)it->enc_->config_->quality;
const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
int k;
uint32_t dc[16], m, m2;
for (k = 0; k < 16; k += 4) {
VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
}
for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
m += dc[k];
m2 += dc[k] * dc[k];
}
if (kThreshold * m2 < m * m) {
VP8SetIntra16Mode(it, 0); // DC16
} else {
const uint8_t modes[16] = { 0 }; // DC4
VP8SetIntra4Mode(it, modes);
}
return 0;
}
static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
int best_alpha = DEFAULT_ALPHA;
int smallest_alpha = 0;
int best_mode = 0;
const int max_mode = MAX_UV_MODE;
int mode;
VP8MakeChroma8Preds(it);
for (mode = 0; mode < max_mode; ++mode) {
VP8Histogram histo;
int alpha;
InitHistogram(&histo);
VP8CollectHistogram(it->yuv_in_ + U_OFF_ENC,
it->yuv_p_ + VP8UVModeOffsets[mode],
16, 16 + 4 + 4, &histo);
alpha = GetAlpha(&histo);
if (IS_BETTER_ALPHA(alpha, best_alpha)) {
best_alpha = alpha;
}
// The best prediction mode tends to be the one with the smallest alpha.
if (mode == 0 || alpha < smallest_alpha) {
smallest_alpha = alpha;
best_mode = mode;
}
}
VP8SetIntraUVMode(it, best_mode);
return best_alpha;
}
static void MBAnalyze(VP8EncIterator* const it,
int alphas[MAX_ALPHA + 1],
int* const alpha, int* const uv_alpha) {
const VP8Encoder* const enc = it->enc_;
int best_alpha, best_uv_alpha;
VP8SetIntra16Mode(it, 0); // default: Intra16, DC_PRED
VP8SetSkip(it, 0); // not skipped
VP8SetSegment(it, 0); // default segment, spec-wise.
if (enc->method_ <= 1) {
best_alpha = FastMBAnalyze(it);
} else {
best_alpha = MBAnalyzeBestIntra16Mode(it);
}
best_uv_alpha = MBAnalyzeBestUVMode(it);
// Final susceptibility mix
best_alpha = (3 * best_alpha + best_uv_alpha + 2) >> 2;
best_alpha = FinalAlphaValue(best_alpha);
alphas[best_alpha]++;
it->mb_->alpha_ = best_alpha; // for later remapping.
// Accumulate for later complexity analysis.
*alpha += best_alpha; // mixed susceptibility (not just luma)
*uv_alpha += best_uv_alpha;
}
static void DefaultMBInfo(VP8MBInfo* const mb) {
mb->type_ = 1; // I16x16
mb->uv_mode_ = 0;
mb->skip_ = 0; // not skipped
mb->segment_ = 0; // default segment
mb->alpha_ = 0;
}
//------------------------------------------------------------------------------
// Main analysis loop:
// Collect all susceptibilities for each macroblock and record their
// distribution in alphas[]. Segments is assigned a-posteriori, based on
// this histogram.
// We also pick an intra16 prediction mode, which shouldn't be considered
// final except for fast-encode settings. We can also pick some intra4 modes
// and decide intra4/intra16, but that's usually almost always a bad choice at
// this stage.
static void ResetAllMBInfo(VP8Encoder* const enc) {
int n;
for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
DefaultMBInfo(&enc->mb_info_[n]);
}
// Default susceptibilities.
enc->dqm_[0].alpha_ = 0;
enc->dqm_[0].beta_ = 0;
// Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
enc->alpha_ = 0;
enc->uv_alpha_ = 0;
WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
}
// struct used to collect job result
typedef struct {
WebPWorker worker;
int alphas[MAX_ALPHA + 1];
int alpha, uv_alpha;
VP8EncIterator it;
int delta_progress;
} SegmentJob;
// main work call
static int DoSegmentsJob(void* arg1, void* arg2) {
SegmentJob* const job = (SegmentJob*)arg1;
VP8EncIterator* const it = (VP8EncIterator*)arg2;
int ok = 1;
if (!VP8IteratorIsDone(it)) {
uint8_t tmp[32 + WEBP_ALIGN_CST];
uint8_t* const scratch = (uint8_t*)WEBP_ALIGN(tmp);
do {
// Let's pretend we have perfect lossless reconstruction.
VP8IteratorImport(it, scratch);
MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
ok = VP8IteratorProgress(it, job->delta_progress);
} while (ok && VP8IteratorNext(it));
}
return ok;
}
static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
int i;
for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
dst->alpha += src->alpha;
dst->uv_alpha += src->uv_alpha;
}
// initialize the job struct with some tasks to perform
static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
int start_row, int end_row) {
WebPGetWorkerInterface()->Init(&job->worker);
job->worker.data1 = job;
job->worker.data2 = &job->it;
job->worker.hook = DoSegmentsJob;
VP8IteratorInit(enc, &job->it);
VP8IteratorSetRow(&job->it, start_row);
VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
memset(job->alphas, 0, sizeof(job->alphas));
job->alpha = 0;
job->uv_alpha = 0;
// only one of both jobs can record the progress, since we don't
// expect the user's hook to be multi-thread safe
job->delta_progress = (start_row == 0) ? 20 : 0;
}
// main entry point
int VP8EncAnalyze(VP8Encoder* const enc) {
int ok = 1;
const int do_segments =
enc->config_->emulate_jpeg_size || // We need the complexity evaluation.
(enc->segment_hdr_.num_segments_ > 1) ||
(enc->method_ <= 1); // for method 0 - 1, we need preds_[] to be filled.
if (do_segments) {
const int last_row = enc->mb_h_;
// We give a little more than a half work to the main thread.
const int split_row = (9 * last_row + 15) >> 4;
const int total_mb = last_row * enc->mb_w_;
#ifdef WEBP_USE_THREAD
const int kMinSplitRow = 2; // minimal rows needed for mt to be worth it
const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
#else
const int do_mt = 0;
#endif
const WebPWorkerInterface* const worker_interface =
WebPGetWorkerInterface();
SegmentJob main_job;
if (do_mt) {
SegmentJob side_job;
// Note the use of '&' instead of '&&' because we must call the functions
// no matter what.
InitSegmentJob(enc, &main_job, 0, split_row);
InitSegmentJob(enc, &side_job, split_row, last_row);
// we don't need to call Reset() on main_job.worker, since we're calling
// WebPWorkerExecute() on it
ok &= worker_interface->Reset(&side_job.worker);
// launch the two jobs in parallel
if (ok) {
worker_interface->Launch(&side_job.worker);
worker_interface->Execute(&main_job.worker);
ok &= worker_interface->Sync(&side_job.worker);
ok &= worker_interface->Sync(&main_job.worker);
}
worker_interface->End(&side_job.worker);
if (ok) MergeJobs(&side_job, &main_job); // merge results together
} else {
// Even for single-thread case, we use the generic Worker tools.
InitSegmentJob(enc, &main_job, 0, last_row);
worker_interface->Execute(&main_job.worker);
ok &= worker_interface->Sync(&main_job.worker);
}
worker_interface->End(&main_job.worker);
if (ok) {
enc->alpha_ = main_job.alpha / total_mb;
enc->uv_alpha_ = main_job.uv_alpha / total_mb;
AssignSegments(enc, main_job.alphas);
}
} else { // Use only one default segment.
ResetAllMBInfo(enc);
}
return ok;
}

View File

@ -0,0 +1,790 @@
// Copyright 2017 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Improves a given set of backward references by analyzing its bit cost.
// The algorithm is similar to the Zopfli compression algorithm but tailored to
// images.
//
// Author: Vincent Rabaud (vrabaud@google.com)
//
#include <assert.h>
#include "enc_backward_references_enc.h"
#include "enc_histogram_enc.h"
#include "dsp_lossless_common.h"
#include "utils_color_cache_utils.h"
#include "utils_utils.h"
#define VALUES_IN_BYTE 256
extern void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
extern int VP8LDistanceToPlaneCode(int xsize, int dist);
extern void VP8LBackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
const PixOrCopy v);
typedef struct {
double alpha_[VALUES_IN_BYTE];
double red_[VALUES_IN_BYTE];
double blue_[VALUES_IN_BYTE];
double distance_[NUM_DISTANCE_CODES];
double* literal_;
} CostModel;
static void ConvertPopulationCountTableToBitEstimates(
int num_symbols, const uint32_t population_counts[], double output[]) {
uint32_t sum = 0;
int nonzeros = 0;
int i;
for (i = 0; i < num_symbols; ++i) {
sum += population_counts[i];
if (population_counts[i] > 0) {
++nonzeros;
}
}
if (nonzeros <= 1) {
memset(output, 0, num_symbols * sizeof(*output));
} else {
const double logsum = VP8LFastLog2(sum);
for (i = 0; i < num_symbols; ++i) {
output[i] = logsum - VP8LFastLog2(population_counts[i]);
}
}
}
static int CostModelBuild(CostModel* const m, int xsize, int cache_bits,
const VP8LBackwardRefs* const refs) {
int ok = 0;
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
if (histo == NULL) goto Error;
// The following code is similar to VP8LHistogramCreate but converts the
// distance to plane code.
VP8LHistogramInit(histo, cache_bits, /*init_arrays=*/ 1);
while (VP8LRefsCursorOk(&c)) {
VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos, VP8LDistanceToPlaneCode,
xsize);
VP8LRefsCursorNext(&c);
}
ConvertPopulationCountTableToBitEstimates(
VP8LHistogramNumCodes(histo->palette_code_bits_),
histo->literal_, m->literal_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->red_, m->red_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->blue_, m->blue_);
ConvertPopulationCountTableToBitEstimates(
VALUES_IN_BYTE, histo->alpha_, m->alpha_);
ConvertPopulationCountTableToBitEstimates(
NUM_DISTANCE_CODES, histo->distance_, m->distance_);
ok = 1;
Error:
VP8LFreeHistogram(histo);
return ok;
}
static WEBP_INLINE double GetLiteralCost(const CostModel* const m, uint32_t v) {
return m->alpha_[v >> 24] +
m->red_[(v >> 16) & 0xff] +
m->literal_[(v >> 8) & 0xff] +
m->blue_[v & 0xff];
}
static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
const int literal_idx = VALUES_IN_BYTE + NUM_LENGTH_CODES + idx;
return m->literal_[literal_idx];
}
static WEBP_INLINE double GetLengthCost(const CostModel* const m,
uint32_t length) {
int code, extra_bits;
VP8LPrefixEncodeBits(length, &code, &extra_bits);
return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
}
static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
uint32_t distance) {
int code, extra_bits;
VP8LPrefixEncodeBits(distance, &code, &extra_bits);
return m->distance_[code] + extra_bits;
}
static WEBP_INLINE void AddSingleLiteralWithCostModel(
const uint32_t* const argb, VP8LColorCache* const hashers,
const CostModel* const cost_model, int idx, int use_color_cache,
float prev_cost, float* const cost, uint16_t* const dist_array) {
double cost_val = prev_cost;
const uint32_t color = argb[idx];
const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
if (ix >= 0) {
// use_color_cache is true and hashers contains color
const double mul0 = 0.68;
cost_val += GetCacheCost(cost_model, ix) * mul0;
} else {
const double mul1 = 0.82;
if (use_color_cache) VP8LColorCacheInsert(hashers, color);
cost_val += GetLiteralCost(cost_model, color) * mul1;
}
if (cost[idx] > cost_val) {
cost[idx] = (float)cost_val;
dist_array[idx] = 1; // only one is inserted.
}
}
// -----------------------------------------------------------------------------
// CostManager and interval handling
// Empirical value to avoid high memory consumption but good for performance.
#define COST_CACHE_INTERVAL_SIZE_MAX 500
// To perform backward reference every pixel at index index_ is considered and
// the cost for the MAX_LENGTH following pixels computed. Those following pixels
// at index index_ + k (k from 0 to MAX_LENGTH) have a cost of:
// cost_ = distance cost at index + GetLengthCost(cost_model, k)
// and the minimum value is kept. GetLengthCost(cost_model, k) is cached in an
// array of size MAX_LENGTH.
// Instead of performing MAX_LENGTH comparisons per pixel, we keep track of the
// minimal values using intervals of constant cost.
// An interval is defined by the index_ of the pixel that generated it and
// is only useful in a range of indices from start_ to end_ (exclusive), i.e.
// it contains the minimum value for pixels between start_ and end_.
// Intervals are stored in a linked list and ordered by start_. When a new
// interval has a better value, old intervals are split or removed. There are
// therefore no overlapping intervals.
typedef struct CostInterval CostInterval;
struct CostInterval {
float cost_;
int start_;
int end_;
int index_;
CostInterval* previous_;
CostInterval* next_;
};
// The GetLengthCost(cost_model, k) are cached in a CostCacheInterval.
typedef struct {
double cost_;
int start_;
int end_; // Exclusive.
} CostCacheInterval;
// This structure is in charge of managing intervals and costs.
// It caches the different CostCacheInterval, caches the different
// GetLengthCost(cost_model, k) in cost_cache_ and the CostInterval's (whose
// count_ is limited by COST_CACHE_INTERVAL_SIZE_MAX).
#define COST_MANAGER_MAX_FREE_LIST 10
typedef struct {
CostInterval* head_;
int count_; // The number of stored intervals.
CostCacheInterval* cache_intervals_;
size_t cache_intervals_size_;
double cost_cache_[MAX_LENGTH]; // Contains the GetLengthCost(cost_model, k).
float* costs_;
uint16_t* dist_array_;
// Most of the time, we only need few intervals -> use a free-list, to avoid
// fragmentation with small allocs in most common cases.
CostInterval intervals_[COST_MANAGER_MAX_FREE_LIST];
CostInterval* free_intervals_;
// These are regularly malloc'd remains. This list can't grow larger than than
// size COST_CACHE_INTERVAL_SIZE_MAX - COST_MANAGER_MAX_FREE_LIST, note.
CostInterval* recycled_intervals_;
} CostManager;
static void CostIntervalAddToFreeList(CostManager* const manager,
CostInterval* const interval) {
interval->next_ = manager->free_intervals_;
manager->free_intervals_ = interval;
}
static int CostIntervalIsInFreeList(const CostManager* const manager,
const CostInterval* const interval) {
return (interval >= &manager->intervals_[0] &&
interval <= &manager->intervals_[COST_MANAGER_MAX_FREE_LIST - 1]);
}
static void CostManagerInitFreeList(CostManager* const manager) {
int i;
manager->free_intervals_ = NULL;
for (i = 0; i < COST_MANAGER_MAX_FREE_LIST; ++i) {
CostIntervalAddToFreeList(manager, &manager->intervals_[i]);
}
}
static void DeleteIntervalList(CostManager* const manager,
const CostInterval* interval) {
while (interval != NULL) {
const CostInterval* const next = interval->next_;
if (!CostIntervalIsInFreeList(manager, interval)) {
WebPSafeFree((void*)interval);
} // else: do nothing
interval = next;
}
}
static void CostManagerClear(CostManager* const manager) {
if (manager == NULL) return;
WebPSafeFree(manager->costs_);
WebPSafeFree(manager->cache_intervals_);
// Clear the interval lists.
DeleteIntervalList(manager, manager->head_);
manager->head_ = NULL;
DeleteIntervalList(manager, manager->recycled_intervals_);
manager->recycled_intervals_ = NULL;
// Reset pointers, count_ and cache_intervals_size_.
memset(manager, 0, sizeof(*manager));
CostManagerInitFreeList(manager);
}
static int CostManagerInit(CostManager* const manager,
uint16_t* const dist_array, int pix_count,
const CostModel* const cost_model) {
int i;
const int cost_cache_size = (pix_count > MAX_LENGTH) ? MAX_LENGTH : pix_count;
manager->costs_ = NULL;
manager->cache_intervals_ = NULL;
manager->head_ = NULL;
manager->recycled_intervals_ = NULL;
manager->count_ = 0;
manager->dist_array_ = dist_array;
CostManagerInitFreeList(manager);
// Fill in the cost_cache_.
manager->cache_intervals_size_ = 1;
manager->cost_cache_[0] = GetLengthCost(cost_model, 0);
for (i = 1; i < cost_cache_size; ++i) {
manager->cost_cache_[i] = GetLengthCost(cost_model, i);
// Get the number of bound intervals.
if (manager->cost_cache_[i] != manager->cost_cache_[i - 1]) {
++manager->cache_intervals_size_;
}
}
// With the current cost model, we usually have below 20 intervals.
// The worst case scenario with a cost model would be if every length has a
// different cost, hence MAX_LENGTH but that is impossible with the current
// implementation that spirals around a pixel.
assert(manager->cache_intervals_size_ <= MAX_LENGTH);
manager->cache_intervals_ = (CostCacheInterval*)WebPSafeMalloc(
manager->cache_intervals_size_, sizeof(*manager->cache_intervals_));
if (manager->cache_intervals_ == NULL) {
CostManagerClear(manager);
return 0;
}
// Fill in the cache_intervals_.
{
CostCacheInterval* cur = manager->cache_intervals_;
// Consecutive values in cost_cache_ are compared and if a big enough
// difference is found, a new interval is created and bounded.
cur->start_ = 0;
cur->end_ = 1;
cur->cost_ = manager->cost_cache_[0];
for (i = 1; i < cost_cache_size; ++i) {
const double cost_val = manager->cost_cache_[i];
if (cost_val != cur->cost_) {
++cur;
// Initialize an interval.
cur->start_ = i;
cur->cost_ = cost_val;
}
cur->end_ = i + 1;
}
}
manager->costs_ = (float*)WebPSafeMalloc(pix_count, sizeof(*manager->costs_));
if (manager->costs_ == NULL) {
CostManagerClear(manager);
return 0;
}
// Set the initial costs_ high for every pixel as we will keep the minimum.
for (i = 0; i < pix_count; ++i) manager->costs_[i] = 1e38f;
return 1;
}
// Given the cost and the position that define an interval, update the cost at
// pixel 'i' if it is smaller than the previously computed value.
static WEBP_INLINE void UpdateCost(CostManager* const manager, int i,
int position, float cost) {
const int k = i - position;
assert(k >= 0 && k < MAX_LENGTH);
if (manager->costs_[i] > cost) {
manager->costs_[i] = cost;
manager->dist_array_[i] = k + 1;
}
}
// Given the cost and the position that define an interval, update the cost for
// all the pixels between 'start' and 'end' excluded.
static WEBP_INLINE void UpdateCostPerInterval(CostManager* const manager,
int start, int end, int position,
float cost) {
int i;
for (i = start; i < end; ++i) UpdateCost(manager, i, position, cost);
}
// Given two intervals, make 'prev' be the previous one of 'next' in 'manager'.
static WEBP_INLINE void ConnectIntervals(CostManager* const manager,
CostInterval* const prev,
CostInterval* const next) {
if (prev != NULL) {
prev->next_ = next;
} else {
manager->head_ = next;
}
if (next != NULL) next->previous_ = prev;
}
// Pop an interval in the manager.
static WEBP_INLINE void PopInterval(CostManager* const manager,
CostInterval* const interval) {
if (interval == NULL) return;
ConnectIntervals(manager, interval->previous_, interval->next_);
if (CostIntervalIsInFreeList(manager, interval)) {
CostIntervalAddToFreeList(manager, interval);
} else { // recycle regularly malloc'd intervals too
interval->next_ = manager->recycled_intervals_;
manager->recycled_intervals_ = interval;
}
--manager->count_;
assert(manager->count_ >= 0);
}
// Update the cost at index i by going over all the stored intervals that
// overlap with i.
// If 'do_clean_intervals' is set to something different than 0, intervals that
// end before 'i' will be popped.
static WEBP_INLINE void UpdateCostAtIndex(CostManager* const manager, int i,
int do_clean_intervals) {
CostInterval* current = manager->head_;
while (current != NULL && current->start_ <= i) {
CostInterval* const next = current->next_;
if (current->end_ <= i) {
if (do_clean_intervals) {
// We have an outdated interval, remove it.
PopInterval(manager, current);
}
} else {
UpdateCost(manager, i, current->index_, current->cost_);
}
current = next;
}
}
// Given a current orphan interval and its previous interval, before
// it was orphaned (which can be NULL), set it at the right place in the list
// of intervals using the start_ ordering and the previous interval as a hint.
static WEBP_INLINE void PositionOrphanInterval(CostManager* const manager,
CostInterval* const current,
CostInterval* previous) {
assert(current != NULL);
if (previous == NULL) previous = manager->head_;
while (previous != NULL && current->start_ < previous->start_) {
previous = previous->previous_;
}
while (previous != NULL && previous->next_ != NULL &&
previous->next_->start_ < current->start_) {
previous = previous->next_;
}
if (previous != NULL) {
ConnectIntervals(manager, current, previous->next_);
} else {
ConnectIntervals(manager, current, manager->head_);
}
ConnectIntervals(manager, previous, current);
}
// Insert an interval in the list contained in the manager by starting at
// interval_in as a hint. The intervals are sorted by start_ value.
static WEBP_INLINE void InsertInterval(CostManager* const manager,
CostInterval* const interval_in,
float cost, int position, int start,
int end) {
CostInterval* interval_new;
if (start >= end) return;
if (manager->count_ >= COST_CACHE_INTERVAL_SIZE_MAX) {
// Serialize the interval if we cannot store it.
UpdateCostPerInterval(manager, start, end, position, cost);
return;
}
if (manager->free_intervals_ != NULL) {
interval_new = manager->free_intervals_;
manager->free_intervals_ = interval_new->next_;
} else if (manager->recycled_intervals_ != NULL) {
interval_new = manager->recycled_intervals_;
manager->recycled_intervals_ = interval_new->next_;
} else { // malloc for good
interval_new = (CostInterval*)WebPSafeMalloc(1, sizeof(*interval_new));
if (interval_new == NULL) {
// Write down the interval if we cannot create it.
UpdateCostPerInterval(manager, start, end, position, cost);
return;
}
}
interval_new->cost_ = cost;
interval_new->index_ = position;
interval_new->start_ = start;
interval_new->end_ = end;
PositionOrphanInterval(manager, interval_new, interval_in);
++manager->count_;
}
// Given a new cost interval defined by its start at position, its length value
// and distance_cost, add its contributions to the previous intervals and costs.
// If handling the interval or one of its subintervals becomes to heavy, its
// contribution is added to the costs right away.
static WEBP_INLINE void PushInterval(CostManager* const manager,
double distance_cost, int position,
int len) {
size_t i;
CostInterval* interval = manager->head_;
CostInterval* interval_next;
const CostCacheInterval* const cost_cache_intervals =
manager->cache_intervals_;
// If the interval is small enough, no need to deal with the heavy
// interval logic, just serialize it right away. This constant is empirical.
const int kSkipDistance = 10;
if (len < kSkipDistance) {
int j;
for (j = position; j < position + len; ++j) {
const int k = j - position;
float cost_tmp;
assert(k >= 0 && k < MAX_LENGTH);
cost_tmp = (float)(distance_cost + manager->cost_cache_[k]);
if (manager->costs_[j] > cost_tmp) {
manager->costs_[j] = cost_tmp;
manager->dist_array_[j] = k + 1;
}
}
return;
}
for (i = 0; i < manager->cache_intervals_size_ &&
cost_cache_intervals[i].start_ < len;
++i) {
// Define the intersection of the ith interval with the new one.
int start = position + cost_cache_intervals[i].start_;
const int end = position + (cost_cache_intervals[i].end_ > len
? len
: cost_cache_intervals[i].end_);
const float cost = (float)(distance_cost + cost_cache_intervals[i].cost_);
for (; interval != NULL && interval->start_ < end;
interval = interval_next) {
interval_next = interval->next_;
// Make sure we have some overlap
if (start >= interval->end_) continue;
if (cost >= interval->cost_) {
// When intervals are represented, the lower, the better.
// [**********************************************************[
// start end
// [----------------------------------[
// interval->start_ interval->end_
// If we are worse than what we already have, add whatever we have so
// far up to interval.
const int start_new = interval->end_;
InsertInterval(manager, interval, cost, position, start,
interval->start_);
start = start_new;
if (start >= end) break;
continue;
}
if (start <= interval->start_) {
if (interval->end_ <= end) {
// [----------------------------------[
// interval->start_ interval->end_
// [**************************************************************[
// start end
// We can safely remove the old interval as it is fully included.
PopInterval(manager, interval);
} else {
// [------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
interval->start_ = end;
break;
}
} else {
if (end < interval->end_) {
// [--------------------------------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
// We have to split the old interval as it fully contains the new one.
const int end_original = interval->end_;
interval->end_ = start;
InsertInterval(manager, interval, interval->cost_, interval->index_,
end, end_original);
interval = interval->next_;
break;
} else {
// [------------------------------------[
// interval->start_ interval->end_
// [*****************************[
// start end
interval->end_ = start;
}
}
}
// Insert the remaining interval from start to end.
InsertInterval(manager, interval, cost, position, start, end);
}
}
static int BackwardReferencesHashChainDistanceOnly(
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain, const VP8LBackwardRefs* const refs,
uint16_t* const dist_array) {
int i;
int ok = 0;
int cc_init = 0;
const int pix_count = xsize * ysize;
const int use_color_cache = (cache_bits > 0);
const size_t literal_array_size =
sizeof(double) * (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
((cache_bits > 0) ? (1 << cache_bits) : 0));
const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
CostModel* const cost_model =
(CostModel*)WebPSafeCalloc(1ULL, cost_model_size);
VP8LColorCache hashers;
CostManager* cost_manager =
(CostManager*)WebPSafeMalloc(1ULL, sizeof(*cost_manager));
int offset_prev = -1, len_prev = -1;
double offset_cost = -1;
int first_offset_is_constant = -1; // initialized with 'impossible' value
int reach = 0;
if (cost_model == NULL || cost_manager == NULL) goto Error;
cost_model->literal_ = (double*)(cost_model + 1);
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
if (!CostModelBuild(cost_model, xsize, cache_bits, refs)) {
goto Error;
}
if (!CostManagerInit(cost_manager, dist_array, pix_count, cost_model)) {
goto Error;
}
// We loop one pixel at a time, but store all currently best points to
// non-processed locations from this point.
dist_array[0] = 0;
// Add first pixel as literal.
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, 0, use_color_cache,
0.f, cost_manager->costs_, dist_array);
for (i = 1; i < pix_count; ++i) {
const float prev_cost = cost_manager->costs_[i - 1];
int offset, len;
VP8LHashChainFindCopy(hash_chain, i, &offset, &len);
// Try adding the pixel as a literal.
AddSingleLiteralWithCostModel(argb, &hashers, cost_model, i,
use_color_cache, prev_cost,
cost_manager->costs_, dist_array);
// If we are dealing with a non-literal.
if (len >= 2) {
if (offset != offset_prev) {
const int code = VP8LDistanceToPlaneCode(xsize, offset);
offset_cost = GetDistanceCost(cost_model, code);
first_offset_is_constant = 1;
PushInterval(cost_manager, prev_cost + offset_cost, i, len);
} else {
assert(offset_cost >= 0);
assert(len_prev >= 0);
assert(first_offset_is_constant == 0 || first_offset_is_constant == 1);
// Instead of considering all contributions from a pixel i by calling:
// PushInterval(cost_manager, prev_cost + offset_cost, i, len);
// we optimize these contributions in case offset_cost stays the same
// for consecutive pixels. This describes a set of pixels similar to a
// previous set (e.g. constant color regions).
if (first_offset_is_constant) {
reach = i - 1 + len_prev - 1;
first_offset_is_constant = 0;
}
if (i + len - 1 > reach) {
// We can only be go further with the same offset if the previous
// length was maxed, hence len_prev == len == MAX_LENGTH.
// TODO(vrabaud), bump i to the end right away (insert cache and
// update cost).
// TODO(vrabaud), check if one of the points in between does not have
// a lower cost.
// Already consider the pixel at "reach" to add intervals that are
// better than whatever we add.
int offset_j, len_j = 0;
int j;
assert(len == MAX_LENGTH || len == pix_count - i);
// Figure out the last consecutive pixel within [i, reach + 1] with
// the same offset.
for (j = i; j <= reach; ++j) {
VP8LHashChainFindCopy(hash_chain, j + 1, &offset_j, &len_j);
if (offset_j != offset) {
VP8LHashChainFindCopy(hash_chain, j, &offset_j, &len_j);
break;
}
}
// Update the cost at j - 1 and j.
UpdateCostAtIndex(cost_manager, j - 1, 0);
UpdateCostAtIndex(cost_manager, j, 0);
PushInterval(cost_manager, cost_manager->costs_[j - 1] + offset_cost,
j, len_j);
reach = j + len_j - 1;
}
}
}
UpdateCostAtIndex(cost_manager, i, 1);
offset_prev = offset;
len_prev = len;
}
ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
CostManagerClear(cost_manager);
WebPSafeFree(cost_model);
WebPSafeFree(cost_manager);
return ok;
}
// We pack the path at the end of *dist_array and return
// a pointer to this part of the array. Example:
// dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
static void TraceBackwards(uint16_t* const dist_array,
int dist_array_size,
uint16_t** const chosen_path,
int* const chosen_path_size) {
uint16_t* path = dist_array + dist_array_size;
uint16_t* cur = dist_array + dist_array_size - 1;
while (cur >= dist_array) {
const int k = *cur;
--path;
*path = k;
cur -= k;
}
*chosen_path = path;
*chosen_path_size = (int)(dist_array + dist_array_size - path);
}
static int BackwardReferencesHashChainFollowChosenPath(
const uint32_t* const argb, int cache_bits,
const uint16_t* const chosen_path, int chosen_path_size,
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs) {
const int use_color_cache = (cache_bits > 0);
int ix;
int i = 0;
int ok = 0;
int cc_init = 0;
VP8LColorCache hashers;
if (use_color_cache) {
cc_init = VP8LColorCacheInit(&hashers, cache_bits);
if (!cc_init) goto Error;
}
VP8LClearBackwardRefs(refs);
for (ix = 0; ix < chosen_path_size; ++ix) {
const int len = chosen_path[ix];
if (len != 1) {
int k;
const int offset = VP8LHashChainFindOffset(hash_chain, i);
VP8LBackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
if (use_color_cache) {
for (k = 0; k < len; ++k) {
VP8LColorCacheInsert(&hashers, argb[i + k]);
}
}
i += len;
} else {
PixOrCopy v;
const int idx =
use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
if (idx >= 0) {
// use_color_cache is true and hashers contains argb[i]
// push pixel as a color cache index
v = PixOrCopyCreateCacheIdx(idx);
} else {
if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
v = PixOrCopyCreateLiteral(argb[i]);
}
VP8LBackwardRefsCursorAdd(refs, v);
++i;
}
}
ok = !refs->error_;
Error:
if (cc_init) VP8LColorCacheClear(&hashers);
return ok;
}
// Returns 1 on success.
extern int VP8LBackwardReferencesTraceBackwards(
int xsize, int ysize, const uint32_t* const argb, int cache_bits,
const VP8LHashChain* const hash_chain,
const VP8LBackwardRefs* const refs_src, VP8LBackwardRefs* const refs_dst);
int VP8LBackwardReferencesTraceBackwards(int xsize, int ysize,
const uint32_t* const argb,
int cache_bits,
const VP8LHashChain* const hash_chain,
const VP8LBackwardRefs* const refs_src,
VP8LBackwardRefs* const refs_dst) {
int ok = 0;
const int dist_array_size = xsize * ysize;
uint16_t* chosen_path = NULL;
int chosen_path_size = 0;
uint16_t* dist_array =
(uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
if (dist_array == NULL) goto Error;
if (!BackwardReferencesHashChainDistanceOnly(
xsize, ysize, argb, cache_bits, hash_chain, refs_src, dist_array)) {
goto Error;
}
TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
if (!BackwardReferencesHashChainFollowChosenPath(
argb, cache_bits, chosen_path, chosen_path_size, hash_chain,
refs_dst)) {
goto Error;
}
ok = 1;
Error:
WebPSafeFree(dist_array);
return ok;
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,240 @@
// Copyright 2012 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Author: Jyrki Alakuijala (jyrki@google.com)
//
#ifndef WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
#define WEBP_ENC_BACKWARD_REFERENCES_ENC_H_
#include <assert.h>
#include <stdlib.h>
#include "webp_types.h"
#include "webp_encode.h"
#include "webp_format_constants.h"
#ifdef __cplusplus
extern "C" {
#endif
// The maximum allowed limit is 11.
#define MAX_COLOR_CACHE_BITS 10
// -----------------------------------------------------------------------------
// PixOrCopy
enum Mode {
kLiteral,
kCacheIdx,
kCopy,
kNone
};
typedef struct {
// mode as uint8_t to make the memory layout to be exactly 8 bytes.
uint8_t mode;
uint16_t len;
uint32_t argb_or_distance;
} PixOrCopy;
static WEBP_INLINE PixOrCopy PixOrCopyCreateCopy(uint32_t distance,
uint16_t len) {
PixOrCopy retval;
retval.mode = kCopy;
retval.argb_or_distance = distance;
retval.len = len;
return retval;
}
static WEBP_INLINE PixOrCopy PixOrCopyCreateCacheIdx(int idx) {
PixOrCopy retval;
assert(idx >= 0);
assert(idx < (1 << MAX_COLOR_CACHE_BITS));
retval.mode = kCacheIdx;
retval.argb_or_distance = idx;
retval.len = 1;
return retval;
}
static WEBP_INLINE PixOrCopy PixOrCopyCreateLiteral(uint32_t argb) {
PixOrCopy retval;
retval.mode = kLiteral;
retval.argb_or_distance = argb;
retval.len = 1;
return retval;
}
static WEBP_INLINE int PixOrCopyIsLiteral(const PixOrCopy* const p) {
return (p->mode == kLiteral);
}
static WEBP_INLINE int PixOrCopyIsCacheIdx(const PixOrCopy* const p) {
return (p->mode == kCacheIdx);
}
static WEBP_INLINE int PixOrCopyIsCopy(const PixOrCopy* const p) {
return (p->mode == kCopy);
}
static WEBP_INLINE uint32_t PixOrCopyLiteral(const PixOrCopy* const p,
int component) {
assert(p->mode == kLiteral);
return (p->argb_or_distance >> (component * 8)) & 0xff;
}
static WEBP_INLINE uint32_t PixOrCopyLength(const PixOrCopy* const p) {
return p->len;
}
static WEBP_INLINE uint32_t PixOrCopyCacheIdx(const PixOrCopy* const p) {
assert(p->mode == kCacheIdx);
assert(p->argb_or_distance < (1U << MAX_COLOR_CACHE_BITS));
return p->argb_or_distance;
}
static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
assert(p->mode == kCopy);
return p->argb_or_distance;
}
// -----------------------------------------------------------------------------
// VP8LHashChain
#define HASH_BITS 18
#define HASH_SIZE (1 << HASH_BITS)
// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
// is used in VP8LHashChain.
#define MAX_LENGTH_BITS 12
#define WINDOW_SIZE_BITS 20
// We want the max value to be attainable and stored in MAX_LENGTH_BITS bits.
#define MAX_LENGTH ((1 << MAX_LENGTH_BITS) - 1)
#if MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32
#error "MAX_LENGTH_BITS + WINDOW_SIZE_BITS > 32"
#endif
typedef struct VP8LHashChain VP8LHashChain;
struct VP8LHashChain {
// The 20 most significant bits contain the offset at which the best match
// is found. These 20 bits are the limit defined by GetWindowSizeForHashChain
// (through WINDOW_SIZE = 1<<20).
// The lower 12 bits contain the length of the match. The 12 bit limit is
// defined in MaxFindCopyLength with MAX_LENGTH=4096.
uint32_t* offset_length_;
// This is the maximum size of the hash_chain that can be constructed.
// Typically this is the pixel count (width x height) for a given image.
int size_;
};
// Must be called first, to set size.
int VP8LHashChainInit(VP8LHashChain* const p, int size);
// Pre-compute the best matches for argb.
int VP8LHashChainFill(VP8LHashChain* const p, int quality,
const uint32_t* const argb, int xsize, int ysize,
int low_effort);
void VP8LHashChainClear(VP8LHashChain* const p); // release memory
static WEBP_INLINE int VP8LHashChainFindOffset(const VP8LHashChain* const p,
const int base_position) {
return p->offset_length_[base_position] >> MAX_LENGTH_BITS;
}
static WEBP_INLINE int VP8LHashChainFindLength(const VP8LHashChain* const p,
const int base_position) {
return p->offset_length_[base_position] & ((1U << MAX_LENGTH_BITS) - 1);
}
static WEBP_INLINE void VP8LHashChainFindCopy(const VP8LHashChain* const p,
int base_position,
int* const offset_ptr,
int* const length_ptr) {
*offset_ptr = VP8LHashChainFindOffset(p, base_position);
*length_ptr = VP8LHashChainFindLength(p, base_position);
}
// -----------------------------------------------------------------------------
// VP8LBackwardRefs (block-based backward-references storage)
// maximum number of reference blocks the image will be segmented into
#define MAX_REFS_BLOCK_PER_IMAGE 16
typedef struct PixOrCopyBlock PixOrCopyBlock; // forward declaration
typedef struct VP8LBackwardRefs VP8LBackwardRefs;
// Container for blocks chain
struct VP8LBackwardRefs {
int block_size_; // common block-size
int error_; // set to true if some memory error occurred
PixOrCopyBlock* refs_; // list of currently used blocks
PixOrCopyBlock** tail_; // for list recycling
PixOrCopyBlock* free_blocks_; // free-list
PixOrCopyBlock* last_block_; // used for adding new refs (internal)
};
// Initialize the object. 'block_size' is the common block size to store
// references (typically, width * height / MAX_REFS_BLOCK_PER_IMAGE).
void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
// Release memory for backward references.
void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
// Cursor for iterating on references content
typedef struct {
// public:
PixOrCopy* cur_pos; // current position
// private:
PixOrCopyBlock* cur_block_; // current block in the refs list
const PixOrCopy* last_pos_; // sentinel for switching to next block
} VP8LRefsCursor;
// Returns a cursor positioned at the beginning of the references list.
VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs);
// Returns true if cursor is pointing at a valid position.
static WEBP_INLINE int VP8LRefsCursorOk(const VP8LRefsCursor* const c) {
return (c->cur_pos != NULL);
}
// Move to next block of references. Internal, not to be called directly.
void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c);
// Move to next position, or NULL. Should not be called if !VP8LRefsCursorOk().
static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
assert(c != NULL);
assert(VP8LRefsCursorOk(c));
if (++c->cur_pos == c->last_pos_) VP8LRefsCursorNextBlock(c);
}
// -----------------------------------------------------------------------------
// Main entry points
enum VP8LLZ77Type {
kLZ77Standard = 1,
kLZ77RLE = 2,
kLZ77Box = 4
};
// Evaluates best possible backward references for specified quality.
// The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
// bits to use (passing 0 implies disabling the local color cache).
// The optimal cache bits is evaluated and set for the *cache_bits_best
// parameter with the matching refs_best.
// If do_no_cache == 0, refs is an array of 2 values and the best
// VP8LBackwardRefs is put in the first element.
// If do_no_cache != 0, refs is an array of 3 values and the best
// VP8LBackwardRefs is put in the first element, the best value with no-cache in
// the second element.
// In both cases, the last element is used as temporary internally.
WebPEncodingError VP8LGetBackwardReferences(
int width, int height, const uint32_t* const argb, int quality,
int low_effort, int lz77_types_to_try, int cache_bits_max, int do_no_cache,
const VP8LHashChain* const hash_chain, VP8LBackwardRefs* const refs,
int* const cache_bits_best);
#ifdef __cplusplus
}
#endif
#endif // WEBP_ENC_BACKWARD_REFERENCES_ENC_H_

Some files were not shown because too many files have changed in this diff Show More